From 6e5bff1c69bdba16837714a13dedb61ca0a370a7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 9 Nov 2023 14:39:40 -0800 Subject: [PATCH 01/71] Update 'store_object' method signature with default None values --- src/hashstore/filehashstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 87f652e7..9732575b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -404,8 +404,8 @@ def lookup_algo(algo): def store_object( self, - pid, - data, + pid=None, + data=None, additional_algorithm=None, checksum=None, checksum_algorithm=None, From efc116e6e2d8b132a7103b1baf7e1e5a054835c4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 9 Nov 2023 15:26:27 -0800 Subject: [PATCH 02/71] Add new private method '_store_data' --- src/hashstore/filehashstore.py | 37 ++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 9732575b..85141c2b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -467,6 +467,43 @@ def store_object( return object_metadata + def _store_data(self, data): + """Store a temporary object to HashStore that is ready to be tagged, and return the + tmp file name and a hex digest dictionary of the default algorithms. 
+ """ + logging.debug("FileHashStore - store_object: Request to store object.") + + # Step 1: Store data + try: + # Ensure the data is a stream + stream = Stream(data) + + # Get the hex digest dictionary + with closing(stream): + ( + object_ref_pid_location, + obj_file_size, + hex_digest_dict, + ) = self._move_and_get_checksums(None, stream) + + object_metadata = ObjectMetadata( + object_ref_pid_location, obj_file_size, hex_digest_dict + ) + # The permanent address of the data stored is based on the data's checksum + cid = hex_digest_dict.get(self.algorithm) + logging.debug( + "FileHashStore - store_object: Successfully stored object with cid: %s", + cid, + ) + return object_metadata + # pylint: disable=W0718 + except Exception as err: + exception_string = ( + "FileHashStore - store_object: failed to store object." + + f" Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + def store_metadata(self, pid, metadata, format_id=None): logging.debug( "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid From 3831baca91dac8f3ece25c7b77e542a269e647b1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 9 Nov 2023 15:29:09 -0800 Subject: [PATCH 03/71] Refactor 'store_object' to only store data when pid is 'None' --- src/hashstore/filehashstore.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 85141c2b..8917843c 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -444,14 +444,17 @@ def store_object( "FileHashStore - store_object: Attempting to store object for pid: %s", pid, ) - object_metadata = self.put_object( - pid, - data, - additional_algorithm=additional_algorithm_checked, - checksum=checksum, - checksum_algorithm=checksum_algorithm_checked, - file_size_to_validate=expected_object_size, - ) + if pid is None: + object_metadata = self._store_data(data) + else: + object_metadata = 
self.put_object( + pid, + data, + additional_algorithm=additional_algorithm_checked, + checksum=checksum, + checksum_algorithm=checksum_algorithm_checked, + file_size_to_validate=expected_object_size, + ) finally: # Release pid with self.object_lock: From 90c32391f2f866b691ce8c94123a5b3fddeb333b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 09:52:53 -0800 Subject: [PATCH 04/71] Refactor '_move_and_get_checksums' to store objects with their content identifiers --- src/hashstore/filehashstore.py | 122 ++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 54 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 8917843c..16002f48 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -470,43 +470,6 @@ def store_object( return object_metadata - def _store_data(self, data): - """Store a temporary object to HashStore that is ready to be tagged, and return the - tmp file name and a hex digest dictionary of the default algorithms. - """ - logging.debug("FileHashStore - store_object: Request to store object.") - - # Step 1: Store data - try: - # Ensure the data is a stream - stream = Stream(data) - - # Get the hex digest dictionary - with closing(stream): - ( - object_ref_pid_location, - obj_file_size, - hex_digest_dict, - ) = self._move_and_get_checksums(None, stream) - - object_metadata = ObjectMetadata( - object_ref_pid_location, obj_file_size, hex_digest_dict - ) - # The permanent address of the data stored is based on the data's checksum - cid = hex_digest_dict.get(self.algorithm) - logging.debug( - "FileHashStore - store_object: Successfully stored object with cid: %s", - cid, - ) - return object_metadata - # pylint: disable=W0718 - except Exception as err: - exception_string = ( - "FileHashStore - store_object: failed to store object." 
- + f" Unexpected {err=}, {type(err)=}" - ) - logging.error(exception_string) - def store_metadata(self, pid, metadata, format_id=None): logging.debug( "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid @@ -696,7 +659,7 @@ def put_object( Returns: object_metadata (ObjectMetadata): object that contains the object id, - object file size, duplicate file boolean and hex digest dictionary. + object file size and hex digest dictionary. """ stream = Stream(file) @@ -725,6 +688,61 @@ def put_object( ) return object_metadata + def _store_data(self, data): + """Store an object to HashStore and return the tmp file name and a hex digest + dictionary of the default algorithms. + + Args: + data (mixed): String or path to object. + + Raises: + IOError: If object fails to store + FileExistsError: If file already exists + + Returns: + object_metadata (ObjectMetadata): object that contains the object id, + object file size and hex digest dictionary. + """ + logging.debug("FileHashStore - store_object: Request to store object.") + + # TODO: Missing Tests + # - Test that this method returns hex digests and that they are correct + # - Test that objects are actually stored with their cid + # - Test that exception is raised when object fails to store + # - Test that exception is raised when object already exists + # - Test providing the data as a file path + # - Test providing the data as a stream + try: + # Ensure the data is a stream + stream = Stream(data) + + # Get the hex digest dictionary + with closing(stream): + ( + object_ref_pid_location, + obj_file_size, + hex_digest_dict, + ) = self._move_and_get_checksums(None, stream) + + object_metadata = ObjectMetadata( + object_ref_pid_location, obj_file_size, hex_digest_dict + ) + # The permanent address of the data stored is based on the data's checksum + cid = hex_digest_dict.get(self.algorithm) + logging.debug( + "FileHashStore - store_object: Successfully stored object with cid: %s", + cid, + ) + return 
object_metadata + # pylint: disable=W0718 + except Exception as err: + exception_string = ( + "FileHashStore - store_object: failed to store object." + + f" Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise IOError(exception_string) from err + def _move_and_get_checksums( self, pid, @@ -756,21 +774,11 @@ def _move_and_get_checksums( file_size_to_validate (bytes, optional): Expected size of object Returns: - object_metadata (tuple): object id, object file size, duplicate file - boolean and hex digest dictionary. + object_metadata (tuple): object id, object file size and hex digest dictionary. """ - entity = "objects" - object_cid = self.get_sha256_hex_digest(pid) - abs_file_path = self.build_abs_path(entity, object_cid, extension) - - # Only create tmp file to be moved if target destination doesn't exist - if os.path.isfile(abs_file_path): - exception_string = ( - "FileHashStore - _move_and_get_checksums: File already exists" - + f" for pid: {pid} at {abs_file_path}" - ) - logging.error(exception_string) - raise FileExistsError(exception_string) + # If the checksum algorithm is the same as the store algorithm, then we can + # determine whether the object exists or not to be efficient + # TODO # Create temporary file and calculate hex digests debug_msg = ( @@ -786,6 +794,11 @@ def _move_and_get_checksums( tmp_file_name, ) + # Objects are stored with their content identifier based on the store algorithm + entity = "objects" + object_cid = hex_digests.get(self.algorithm) + abs_file_path = self.build_abs_path(entity, object_cid, extension) + # Only move file if it doesn't exist. # Files are stored once and only once if not os.path.isfile(abs_file_path): @@ -850,12 +863,13 @@ def _move_and_get_checksums( raise else: # Else delete temporary file - warning_msg = ( + exception_string = ( f"FileHashStore - _move_and_get_checksums: Object exists at: {abs_file_path}," + " deleting temporary file." 
) - logging.warning(warning_msg) + logging.error(exception_string) self.delete(entity, tmp_file_name) + raise FileExistsError(exception_string) return (object_cid, tmp_file_size, hex_digests) From f6a5cd17c249fd5d74790c6db28143d4ec97baaf Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 10:02:25 -0800 Subject: [PATCH 05/71] Update tests after store_object refactor to store with object's content identifier --- tests/test_filehashstore.py | 5 ++-- tests/test_filehashstore_interface.py | 36 +++++++++++---------------- tests/test_hashstore_client.py | 2 +- 3 files changed, 18 insertions(+), 25 deletions(-) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index a2f0fdfe..6331ba5d 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -245,7 +245,7 @@ def test_put_object_cid(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.put_object(pid, path) object_metadata_id = object_metadata.id - assert object_metadata_id == pids[pid]["object_cid"] + assert object_metadata_id == pids[pid][store.algorithm] def test_put_object_file_size(pids, store): @@ -321,8 +321,7 @@ def test_move_and_get_checksums_id(pids, store): _, ) = store._move_and_get_checksums(pid, input_stream) input_stream.close() - object_cid = store.get_sha256_hex_digest(pid) - assert move_id == object_cid + assert move_id == pids[pid][store.algorithm] def test_move_and_get_checksums_file_size(pids, store): diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 92b125cb..329af168 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -41,7 +41,7 @@ def test_store_object(pids, store): syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) - assert object_metadata.id == pids[pid]["object_cid"] + assert object_metadata.id == pids[pid][store.algorithm] assert 
store.count(entity) == 3 @@ -56,7 +56,7 @@ def test_store_object_files_path(pids, store): syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) - assert store.exists(entity, pids[pid]["object_cid"]) + assert store.exists(entity, pids[pid][store.algorithm]) assert store.count(entity) == 3 @@ -71,7 +71,7 @@ def test_store_object_files_string(pids, store): syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path_string) _metadata_cid = store.store_metadata(pid, syspath, format_id) - assert store.exists(entity, pids[pid]["object_cid"]) + assert store.exists(entity, pids[pid][store.algorithm]) assert store.count(entity) == 3 @@ -84,18 +84,17 @@ def test_store_object_files_input_stream(pids, store): input_stream = io.open(path, "rb") _object_metadata = store.store_object(pid, input_stream) input_stream.close() - object_cid = store.get_sha256_hex_digest(pid) - assert store.exists(entity, object_cid) + assert store.exists(entity, pids[pid][store.algorithm]) assert store.count(entity) == 3 def test_store_object_id(pids, store): - """Test store object returns expected id (object_cid).""" + """Test store object returns expected id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) - assert object_metadata.id == pids[pid]["object_cid"] + assert object_metadata.id == pids[pid][store.algorithm] def test_store_object_obj_size(pids, store): @@ -192,11 +191,10 @@ def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): object_metadata = store.store_object(pid, path, algorithm_with_hyphen_and_upper) sha256_cid = object_metadata.hex_digests.get("sha384") assert sha256_cid == pids[pid]["sha384"] - object_cid = store.get_sha256_hex_digest(pid) - assert store.exists(entity, object_cid) + assert store.exists(entity, pids[pid][store.algorithm]) -def 
test_store_object_additional_algorithm_hyphen_lowercase(store): +def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): """Test store object with additional algorithm in lowercase.""" test_dir = "tests/testdata/" entity = "objects" @@ -209,11 +207,10 @@ def test_store_object_additional_algorithm_hyphen_lowercase(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) assert additional_sha3_256_hex_digest == sha3_256_checksum - object_cid = store.get_sha256_hex_digest(pid) - assert store.exists(entity, object_cid) + assert store.exists(entity, pids[pid][store.algorithm]) -def test_store_object_additional_algorithm_underscore(store): +def test_store_object_additional_algorithm_underscore(pids, store): """Test store object with additional algorithm with underscore.""" test_dir = "tests/testdata/" entity = "objects" @@ -226,8 +223,7 @@ def test_store_object_additional_algorithm_underscore(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) assert additional_sha3_256_hex_digest == sha3_256_checksum - pid_hash = store.get_sha256_hex_digest(pid) - assert store.exists(entity, pid_hash) + assert store.exists(entity, pids[pid][store.algorithm]) def test_store_object_checksum_correct(store): @@ -356,7 +352,7 @@ def test_store_object_checksum_incorrect_checksum(store): ) -def test_store_object_duplicate_raises_error(store): +def test_store_object_duplicate_raises_error(pids, store): """Test store duplicate object throws FileExistsError.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" @@ -368,8 +364,7 @@ def test_store_object_duplicate_raises_error(store): with pytest.raises(FileExistsError): _object_metadata_two = store.store_object(pid, path) assert store.count(entity) == 1 - object_cid = store.get_sha256_hex_digest(pid) - assert store.exists(entity, object_cid) + assert store.exists(entity, pids[pid][store.algorithm]) def test_store_object_with_obj_file_size(store, pids): @@ -415,7 +410,7 @@ def 
test_store_object_with_obj_file_size_zero(store, pids): store.store_object(pid, path, expected_object_size=obj_file_size) -def test_store_object_duplicates_threads(store): +def test_store_object_duplicates_threads(pids, store): """Test store object thread lock.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" @@ -442,8 +437,7 @@ def store_object_wrapper(pid, path): thread3.join() # One thread will succeed, file count must still be 1 assert store.count(entity) == 1 - object_cid = store.get_sha256_hex_digest(pid) - assert store.exists(entity, object_cid) + assert store.exists(entity, pids[pid][store.algorithm]) assert file_exists_error_flag diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 7d73e524..f3f24477 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -66,7 +66,7 @@ def test_store_object(store, pids): sys.argv = chs_args client.main() - assert store.exists("objects", pids[pid]["object_cid"]) + assert store.exists("objects", pids[pid][store.algorithm]) def test_store_metadata(store, pids): From 5a69001d12c62531a683f6c08f2f2cef0f20f367 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 10:15:25 -0800 Subject: [PATCH 06/71] Update HashStore interface documentation --- src/hashstore/hashstore.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 6c704209..9b091ae1 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -5,9 +5,8 @@ class HashStore(ABC): - """HashStore is a content-addressable file management system that - utilizes a persistent identifier (PID) in the form of a hex digest - value to address files.""" + """HashStore is a content-addressable file management system that utilizes + an object's content identifier (hex digest/checksum) to address files.""" @staticmethod def version(): @@ -26,16 +25,15 @@ def store_object( expected_object_size, ): 
"""The `store_object` method is responsible for the atomic storage of objects to - disk using a given InputStream and a persistent identifier (pid). Upon - successful storage, the method returns a ObjectMetadata object containing - relevant file information, such as the file's id (which can be used to locate the - object on disk), the file's size, and a hex digest map of algorithms and checksums. - `store_object` also ensures that an object is stored only once by synchronizing - multiple calls and rejecting calls to store duplicate objects. - - The file's id is determined by calculating the SHA-256 hex digest of the - provided pid, which is also used as the permanent address of the file. The - file's identifier is then sharded using a depth of 3 and width of 2, + disk using a given stream. Upon successful storage, the method returns a ObjectMetadata + object containing relevant file information, such as the file's id (which can be + used to locate the object on disk), the file's size, and a hex digest map of algorithms + and checksums. `store_object` also ensures that an object is stored only once by + synchronizing multiple calls and rejecting calls to store duplicate objects. + + The file's id is determined by calculating the object's content identifier based on + the store's default algorithm, which is also used as the permanent address of the file. + The file's identifier is then sharded using a depth of 3 and width of 2, delimited by '/' and concatenated to produce the final permanent address and is stored in the `/store_directory/objects/` directory. @@ -61,7 +59,7 @@ def store_object( Returns: object_metadata (ObjectMetadata): Object that contains the permanent address, - file size, duplicate file boolean and hex digest dictionary. + file size and hex digest dictionary. 
""" raise NotImplementedError() From d18eba9662c0e71b9f28bef167f7f526a23b5d51 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 10:58:49 -0800 Subject: [PATCH 07/71] Add new public API methods to HashStore interface 'tag_object' and 'find_object' --- src/hashstore/hashstore.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 9b091ae1..130c1304 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -63,6 +63,35 @@ def store_object( """ raise NotImplementedError() + @abstractmethod + def tag_object(self, pid, cid): + """The `tag_object` method creates references that allow objects stored in HashStore + to be discoverable. Retrieving, deleting or calculating a hex digest of an object is + based on a pid argument; and to proceed, we must be able to find the object associated + with the pid. + + Args: + pid (string): Authority-based or persistent identifier of object + cid (string): Content identifier of object + + Returns: + boolean: `True` upon successful tagging. + """ + raise NotImplementedError() + + @abstractmethod + def find_object(self, pid): + """The `find_object` method checks whether an object referenced by a pid exists + and returns the content identifier. + + Args: + pid (string): Authority-based or persistent identifier of object + + Returns: + cid (string): Content identifier of the object + """ + raise NotImplementedError() + @abstractmethod def store_metadata(self, pid, metadata, format_id): """The `store_metadata` method is responsible for adding and/or updating metadata @@ -87,9 +116,8 @@ def store_metadata(self, pid, metadata, format_id): @abstractmethod def retrieve_object(self, pid): """The `retrieve_object` method retrieves an object from disk using a given - persistent identifier (pid). 
If the object exists (determined by calculating - the object's permanent address using the SHA-256 hash of the given pid), the - method will open and return a buffered object stream ready to read from. + persistent identifier (pid). If the object exists, the method will open and return + a buffered object stream ready to read from. Args: pid (string): Authority-based identifier. From 78f84c3b08b27297a33256e2a74b08e5829ad809 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 11:04:04 -0800 Subject: [PATCH 08/71] Update HashStore initialization to create required 'refs' directory and subdirectories --- src/hashstore/filehashstore.py | 10 ++++++++++ tests/test_filehashstore.py | 3 +++ 2 files changed, 13 insertions(+) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 16002f48..fe6402b6 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -104,10 +104,14 @@ def __init__(self, properties=None): # Complete initialization/instantiation by setting and creating store directories self.objects = self.root + "/objects" self.metadata = self.root + "/metadata" + self.refs = self.root + "/refs" if not os.path.exists(self.objects): self.create_path(self.objects + "/tmp") if not os.path.exists(self.metadata): self.create_path(self.metadata + "/tmp") + if not os.path.exists(self.refs): + self.create_path(self.refs + "/pids") + self.create_path(self.refs + "/cids") logging.debug( "FileHashStore - Initialization success. 
Store root: %s", self.root ) @@ -470,6 +474,12 @@ def store_object( return object_metadata + def tag_object(self, pid, cid): + return + + def find_object(self, pid): + return + def store_metadata(self, pid, metadata, format_id=None): logging.debug( "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 6331ba5d..778725b0 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -18,6 +18,9 @@ def test_init_directories_created(store): assert os.path.exists(store.objects + "/tmp") assert os.path.exists(store.metadata) assert os.path.exists(store.metadata + "/tmp") + assert os.path.exists(store.refs) + assert os.path.exists(store.refs + "/pids") + assert os.path.exists(store.refs + "/cids") def test_init_existing_store_incorrect_algorithm_format(store): From 0af3514b6e0ad1059cd43735e15e5beb8fd5be3b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 11:14:57 -0800 Subject: [PATCH 09/71] Add TODOs and pseudo code in 'FileHashStore' --- src/hashstore/filehashstore.py | 25 +++++++++++++++++++++---- tests/test_filehashstore.py | 4 ++-- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index fe6402b6..8a581cc7 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -110,8 +110,8 @@ def __init__(self, properties=None): if not os.path.exists(self.metadata): self.create_path(self.metadata + "/tmp") if not os.path.exists(self.refs): - self.create_path(self.refs + "/pids") - self.create_path(self.refs + "/cids") + self.create_path(self.refs + "/pid") + self.create_path(self.refs + "/cid") logging.debug( "FileHashStore - Initialization success. 
Store root: %s", self.root ) @@ -475,9 +475,19 @@ def store_object( return object_metadata def tag_object(self, pid, cid): + # Synchronize tag_object with a lock + # Acquire system-wide file lock on the cid to be evaluated + # Check to see if reference file already exists for the cid + # If it does, read the file and add the new pid on its own line + # If not, create the cid ref file '.../refs/cid' with the first line being the pid + # Then create the pid ref file in '.../refs/pid' with the cid as its content + # Release system-wide file lock on the cid + # Release initial lock return def find_object(self, pid): + # Get the path to the pid reference by calculating its hash in '.../refs/pid' + # Read the file to get the cid from the pid reference and return it return def store_metadata(self, pid, metadata, format_id=None): @@ -533,6 +543,8 @@ def retrieve_object(self, pid): ) self._is_string_none_or_empty(pid, "pid", "retrieve_object") + # TODO: Find object from the pid reference file + entity = "objects" object_cid = self.get_sha256_hex_digest(pid) object_exists = self.exists(entity, object_cid) @@ -586,6 +598,10 @@ def delete_object(self, pid): ) self._is_string_none_or_empty(pid, "pid", "delete_object") + # TODO: Also find the reference file and delete it if there's only one ref + # Else delete the pid in the cid refs file + # Also delete the pid ref file + entity = "objects" object_cid = self.get_sha256_hex_digest(pid) self.delete(entity, object_cid) @@ -622,6 +638,8 @@ def get_hex_digest(self, pid, algorithm): self._is_string_none_or_empty(pid, "pid", "get_hex_digest") self._is_string_none_or_empty(algorithm, "algorithm", "get_hex_digest") + # TODO: Find object from the pid reference file + entity = "objects" algorithm = self.clean_algorithm(algorithm) object_cid = self.get_sha256_hex_digest(pid) @@ -786,9 +804,8 @@ def _move_and_get_checksums( Returns: object_metadata (tuple): object id, object file size and hex digest dictionary. 
""" - # If the checksum algorithm is the same as the store algorithm, then we can + # TODO: If the checksum algorithm is the same as the store algorithm, then we can # determine whether the object exists or not to be efficient - # TODO # Create temporary file and calculate hex digests debug_msg = ( diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 778725b0..d0894978 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -19,8 +19,8 @@ def test_init_directories_created(store): assert os.path.exists(store.metadata) assert os.path.exists(store.metadata + "/tmp") assert os.path.exists(store.refs) - assert os.path.exists(store.refs + "/pids") - assert os.path.exists(store.refs + "/cids") + assert os.path.exists(store.refs + "/pid") + assert os.path.exists(store.refs + "/cid") def test_init_existing_store_incorrect_algorithm_format(store): From 6a32c457b5bf8c39f3cc437493ecd175d13f5a7c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 12:24:48 -0800 Subject: [PATCH 10/71] Rename 'put_object' method to '_store_and_validate_data' and update tests, and '_store_data' to 'store_data_only' --- src/hashstore/filehashstore.py | 13 ++++--- tests/test_filehashstore.py | 70 ++++++++++++++++++---------------- 2 files changed, 45 insertions(+), 38 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 8a581cc7..62c455eb 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -449,9 +449,9 @@ def store_object( pid, ) if pid is None: - object_metadata = self._store_data(data) + object_metadata = self.store_data_only(data) else: - object_metadata = self.put_object( + object_metadata = self.store_and_validate_data( pid, data, additional_algorithm=additional_algorithm_checked, @@ -661,7 +661,7 @@ def get_hex_digest(self, pid, algorithm): # FileHashStore Core Methods - def put_object( + def store_and_validate_data( self, pid, file, @@ -671,7 +671,8 @@ def 
put_object( checksum_algorithm=None, file_size_to_validate=None, ): - """Store contents of `file` on disk using the hash of the given pid + """Store contents of `file` on disk using, validate the object's parameters if + provided and tag/reference the object. Args: pid (string): Authority-based identifier. \n @@ -716,8 +717,8 @@ def put_object( ) return object_metadata - def _store_data(self, data): - """Store an object to HashStore and return the tmp file name and a hex digest + def store_data_only(self, data): + """Store an object to HashStore and return the id and a hex digest dictionary of the default algorithms. Args: diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index d0894978..8ddf75fb 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -112,7 +112,7 @@ def test_init_with_existing_hashstore_missing_yaml(store, pids): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - store.put_object(pid, path) + store.store_and_validate_data(pid, path) os.remove(store.hashstore_configuration_yaml) properties = { "store_path": store.root, @@ -198,75 +198,75 @@ def test_set_default_algorithms_missing_yaml(store, pids): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - store.put_object(pid, path) + store.store_and_validate_data(pid, path) os.remove(store.hashstore_configuration_yaml) with pytest.raises(FileNotFoundError): # pylint: disable=W0212 store._set_default_algorithms() -def test_put_object_files_path(pids, store): +def test_store_and_validate_data_files_path(pids, store): """Test put objects with path object.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = Path(test_dir) / pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_id = object_metadata.id assert store.exists(entity, object_metadata_id) -def 
test_put_object_files_string(pids, store): +def test_store_and_validate_data_files_string(pids, store): """Test put objects with string.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_id = object_metadata.id assert store.exists(entity, object_metadata_id) -def test_put_object_files_stream(pids, store): +def test_store_and_validate_data_files_stream(pids, store): """Test put objects with stream.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - object_metadata = store.put_object(pid, input_stream) + object_metadata = store.store_and_validate_data(pid, input_stream) input_stream.close() object_metadata_id = object_metadata.id assert store.exists(entity, object_metadata_id) assert store.count(entity) == 3 -def test_put_object_cid(pids, store): +def test_store_and_validate_data_cid(pids, store): """Check put returns correct id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_id = object_metadata.id assert object_metadata_id == pids[pid][store.algorithm] -def test_put_object_file_size(pids, store): +def test_store_and_validate_data_file_size(pids, store): """Check put returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_size = object_metadata.obj_size assert object_size == pids[pid]["file_size_bytes"] -def test_put_object_hex_digests(pids, store): +def test_store_and_validate_data_hex_digests(pids, store): """Check put 
successfully generates hex digests dictionary.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_hex_digests = object_metadata.hex_digests assert object_metadata_hex_digests.get("md5") == pids[pid]["md5"] assert object_metadata_hex_digests.get("sha1") == pids[pid]["sha1"] @@ -275,30 +275,34 @@ def test_put_object_hex_digests(pids, store): assert object_metadata_hex_digests.get("sha512") == pids[pid]["sha512"] -def test_put_object_additional_algorithm(pids, store): - """Check put_object returns additional algorithm in hex digests.""" +def test_store_and_validate_data_additional_algorithm(pids, store): + """Check store_and_validate_data returns additional algorithm in hex digests.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path, additional_algorithm=algo) + object_metadata = store.store_and_validate_data( + pid, path, additional_algorithm=algo + ) hex_digests = object_metadata.hex_digests sha224_hash = hex_digests.get(algo) assert sha224_hash == pids[pid][algo] -def test_put_object_with_correct_checksums(pids, store): - """Check put_object success with valid checksum and checksum algorithm supplied.""" +def test_store_and_validate_data_with_correct_checksums(pids, store): + """Check store_and_validate_data success with valid checksum and checksum algorithm supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" algo_checksum = pids[pid][algo] path = test_dir + pid.replace("/", "_") - store.put_object(pid, path, checksum=algo_checksum, checksum_algorithm=algo) + store.store_and_validate_data( + pid, path, checksum=algo_checksum, checksum_algorithm=algo + ) assert store.count("objects") == 3 -def test_put_object_with_incorrect_checksum(pids, store): +def 
test_store_and_validate_data_with_incorrect_checksum(pids, store): """Check put fails when bad checksum supplied.""" test_dir = "tests/testdata/" entity = "objects" @@ -307,7 +311,9 @@ def test_put_object_with_incorrect_checksum(pids, store): algo_checksum = "badChecksumValue" path = test_dir + pid.replace("/", "_") with pytest.raises(ValueError): - store.put_object(pid, path, checksum=algo_checksum, checksum_algorithm=algo) + store.store_and_validate_data( + pid, path, checksum=algo_checksum, checksum_algorithm=algo + ) assert store.count(entity) == 0 @@ -634,7 +640,7 @@ def test_exists_with_object_metadata_id(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) assert store.exists(entity, object_metadata.id) @@ -644,7 +650,7 @@ def test_exists_with_sharded_path(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_shard = store.shard(object_metadata.id) object_metadata_shard_path = "/".join(object_metadata_shard) assert store.exists(entity, object_metadata_shard_path) @@ -677,7 +683,7 @@ def test_open_objects(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_id = object_metadata.id io_buffer = store.open(entity, object_metadata_id) assert isinstance(io_buffer, io.BufferedReader) @@ -690,7 +696,7 @@ def test_delete_by_object_metadata_id(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_id = object_metadata.id 
store.delete(entity, object_metadata_id) assert store.count(entity) == 0 @@ -739,7 +745,7 @@ def test_remove_empty_does_not_remove_nonempty_folders(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_shard = store.shard(object_metadata.id) object_metadata_shard_path = "/".join(object_metadata_shard) # Get parent directory of the relative path @@ -802,7 +808,7 @@ def test_get_real_path_with_object_id(store, pids): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) obj_abs_path = store.get_real_path(entity, object_metadata.id) assert os.path.exists(obj_abs_path) @@ -813,7 +819,7 @@ def test_get_real_path_with_object_id_sharded(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_shard = store.shard(object_metadata.id) object_metadata_shard_path = "/".join(object_metadata_shard) obj_abs_path = store.get_real_path(entity, object_metadata_shard_path) @@ -839,7 +845,7 @@ def test_get_real_path_with_bad_entity(store, pids): entity = "bad_entity" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) with pytest.raises(ValueError): store.get_real_path(entity, object_metadata.id) @@ -850,7 +856,7 @@ def test_build_abs_path(store, pids): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - _ = store.put_object(pid, path) + _ = store.store_and_validate_data(pid, path) # pylint: disable=W0212 abs_path = store.build_abs_path(entity, 
pids[pid]["object_cid"]) assert abs_path @@ -862,7 +868,7 @@ def test_count(pids, store): entity = "objects" for pid in pids.keys(): path_string = test_dir + pid.replace("/", "_") - store.put_object(pid, path_string) + store.store_and_validate_data(pid, path_string) assert store.count(entity) == 3 From 270e556725f4ec3805cff9c10341b7b994b73502 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 12:40:06 -0800 Subject: [PATCH 11/71] Add reference locks and skeleton code for 'tag_object' --- src/hashstore/filehashstore.py | 43 +++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 62c455eb..98cfdff9 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -61,8 +61,10 @@ class FileHashStore(HashStore): time_out_sec = 1 object_lock = threading.Lock() metadata_lock = threading.Lock() + reference_lock = threading.Lock() object_locked_pids = [] metadata_locked_pids = [] + reference_locked_cids = [] def __init__(self, properties=None): if properties: @@ -475,14 +477,39 @@ def store_object( return object_metadata def tag_object(self, pid, cid): - # Synchronize tag_object with a lock - # Acquire system-wide file lock on the cid to be evaluated - # Check to see if reference file already exists for the cid - # If it does, read the file and add the new pid on its own line - # If not, create the cid ref file '.../refs/cid' with the first line being the pid - # Then create the pid ref file in '.../refs/pid' with the cid as its content - # Release system-wide file lock on the cid - # Release initial lock + # Wait for the cid to release if it's being tagged + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - tag_object: (cid) %s is currently being tagged. 
Waiting.", + cid, + ) + time.sleep(self.time_out_sec) + # Modify reference_locked_cids consecutively + with self.reference_lock: + logging.debug( + "FileHashStore - tag_object: Adding cid: %s to reference_locked_cids.", + cid, + ) + self.reference_locked_cids.append(cid) + try: + # Acquire system-wide file lock on the cid to be evaluated + # Check to see if reference file already exists for the cid + # If it does, read the file and add the new pid on its own line + # If not, create the cid ref file '.../refs/cid' with the first line being the pid + # Then create the pid ref file in '.../refs/pid' with the cid as its content + # Release system-wide file lock on the cid + # Release initial lock + print("Tag object") + finally: + # Release pid + with self.reference_lock: + logging.debug( + "FileHashStore - tag_object: Removing cid: %s from reference_locked_cids.", + cid, + ) + self.reference_locked_cids.remove(cid) + info_msg = f"FileHashStore - tag_object: Successfully tagged cid: {cid} with pid: {pid}" + logging.info(info_msg) return def find_object(self, pid): From d20f41e37ab46e7514d64aa955c8d320fa701d68 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 13:31:59 -0800 Subject: [PATCH 12/71] Fill out 'tag_object' skeleton, update 'get_store_path' method for 'refs' and add new empty method '_write_cid_reference' --- src/hashstore/filehashstore.py | 38 +++++++++++++++++++++++----------- tests/test_filehashstore.py | 8 +++++++ 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 98cfdff9..53741789 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -492,16 +492,18 @@ def tag_object(self, pid, cid): ) self.reference_locked_cids.append(cid) try: - # Acquire system-wide file lock on the cid to be evaluated # Check to see if reference file already exists for the cid - # If it does, read the file and add the new pid on its own line - # If not, create 
the cid ref file '.../refs/cid' with the first line being the pid - # Then create the pid ref file in '.../refs/pid' with the cid as its content - # Release system-wide file lock on the cid - # Release initial lock - print("Tag object") + entity = "refs" + ref_abs_path = self.build_abs_path(entity, cid) + if os.path.isfile(ref_abs_path): + # If it does, read the file and add the new pid on its own line + print("Add pid to reference file") + else: + # If not, create the cid ref file '.../refs/cid' with the first line being the pid + # Then create the pid ref file in '.../refs/pid' with the cid as its content + print("Create and tag reference file") finally: - # Release pid + # Release cid with self.reference_lock: logging.debug( "FileHashStore - tag_object: Removing cid: %s from reference_locked_cids.", @@ -1030,6 +1032,16 @@ def delete_tmp_file(): ) logging.error(exception_string) + def _write_cid_reference(self, pid, cid): + """Write the reference file for the given content identifier (cid). A reference + file contains every pid that references a cid on a new line. + + Args: + pid (string): Authority-based or persistent identifier of object + cid (string): Content identifier of object + """ + print("Writing reference") + def put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the given pid and format_id as the permanent address. @@ -1381,9 +1393,11 @@ def get_store_path(self, entity): return Path(self.objects) elif entity == "metadata": return Path(self.metadata) + elif entity == "refs": + return Path(self.refs) else: raise ValueError( - f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" + f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" ) def exists(self, entity, file): @@ -1554,18 +1568,18 @@ def get_real_path(self, entity, file): # Could not determine a match. 
return None - def build_abs_path(self, entity, cid, extension=""): + def build_abs_path(self, entity, hash_id, extension=""): """Build the absolute file path for a given hash id with an optional file extension. Args: entity (str): Desired entity type (ex. "objects", "metadata"). \n - cid (str): A hash id to build a file path for. \n + hash_id (str): A hash id to build a file path for. \n extension (str): An optional file extension to append to the file path. Returns: absolute_path (str): An absolute file path for the specified hash id. """ - paths = self.shard(cid) + paths = self.shard(hash_id) root_dir = self.get_store_path(entity) if extension and not extension.startswith(os.extsep): diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 8ddf75fb..f3184c20 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -634,6 +634,14 @@ def test_get_store_path_metadata(store): assert path_metadata_string.endswith("/metacat/metadata") +def test_get_store_path_refs(store): + """Check get_store_path for refs path.""" + # pylint: disable=W0212 + path_metadata = store.get_store_path("refs") + path_metadata_string = str(path_metadata) + assert path_metadata_string.endswith("/metacat/refs") + + def test_exists_with_object_metadata_id(pids, store): """Test exists method with an absolute file path.""" test_dir = "tests/testdata/" From 73a2c66d7086fbfbd64bcdf0f14b3e718b5d196c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 14:17:16 -0800 Subject: [PATCH 13/71] Fix test for 'build_abs_path' in FileHashStore --- tests/test_filehashstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index f3184c20..a1f80fc2 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -866,8 +866,8 @@ def test_build_abs_path(store, pids): path = test_dir + pid.replace("/", "_") _ = store.store_and_validate_data(pid, path) # pylint: 
disable=W0212 - abs_path = store.build_abs_path(entity, pids[pid]["object_cid"]) - assert abs_path + abs_path = store.build_abs_path(entity, pids[pid][store.algorithm]) + assert os.path.exists(abs_path) def test_count(pids, store): From f44871ece293efe01e12f189d8ae421fdafaef54 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 10 Nov 2023 15:39:33 -0800 Subject: [PATCH 14/71] Add new fcntl import, code method 'write_cid_reference' and add new pytests --- src/hashstore/filehashstore.py | 39 ++++++++++++++++++++++++++++------ tests/test_filehashstore.py | 25 ++++++++++++++++++++++ 2 files changed, 57 insertions(+), 7 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 53741789..ec8e9804 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -10,6 +10,7 @@ from pathlib import Path from contextlib import closing from tempfile import NamedTemporaryFile +import fcntl import yaml from hashstore import HashStore, ObjectMetadata @@ -494,14 +495,18 @@ def tag_object(self, pid, cid): try: # Check to see if reference file already exists for the cid entity = "refs" - ref_abs_path = self.build_abs_path(entity, cid) - if os.path.isfile(ref_abs_path): + cid_ref_abs_path = self.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + if os.path.exists(cid_ref_abs_path): # If it does, read the file and add the new pid on its own line print("Add pid to reference file") else: - # If not, create the cid ref file '.../refs/cid' with the first line being the pid + # If not, create the cid ref file in '.../refs/cid' and write the pid + self.create_path(os.path.dirname(cid_ref_abs_path)) + self.write_cid_reference(cid_ref_abs_path, pid) # Then create the pid ref file in '.../refs/pid' with the cid as its content - print("Create and tag reference file") + # TODO: Write the pid ref file that contains the cid finally: # Release cid with self.reference_lock: @@ -1032,15 +1037,35 @@ def delete_tmp_file(): ) 
logging.error(exception_string) - def _write_cid_reference(self, pid, cid): + def write_cid_reference(self, cid_ref_abs_path, pid): """Write the reference file for the given content identifier (cid). A reference file contains every pid that references a cid on a new line. Args: + cid_ref_abs_path (string): Absolute path to the cid ref file pid (string): Authority-based or persistent identifier of object - cid (string): Content identifier of object """ - print("Writing reference") + info_msg = ( + f"FileHashStore - _write_cid_reference: Writing pid ({pid}) into cid reference" + + f" file: {cid_ref_abs_path}" + ) + logging.info(info_msg) + + try: + with open(cid_ref_abs_path, "w", encoding="utf8") as cid_ref_file: + fcntl.flock(cid_ref_file, fcntl.LOCK_EX) + cid_ref_file.write(pid + "\n") + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) + return + except Exception as err: + exception_string = ( + "FileHashStore - _write_cid_reference: failed to write reference for cid:" + + f" {cid_ref_abs_path}. 
Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise IOError(exception_string) from err def put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index a1f80fc2..df008cf8 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -543,6 +543,31 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): input_stream.close() +def test_write_cid_reference(pids, store): + """Test that write_cid_reference writes a reference file""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_reference(cid_ref_abs_path, pid) + assert os.path.exists(cid_ref_abs_path) + + +def test_write_cid_reference_content(pids, store): + """Test that write_cid_reference writes the expected content""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_reference(cid_ref_abs_path, pid) + + def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" entity = "metadata" From 4c01344838dcd12ad4b9086236573e17f3429b8d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 09:30:32 -0800 Subject: [PATCH 15/71] Add missing assertion statement to 'write_cid_reference' test for verifying content --- tests/test_filehashstore.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index df008cf8..b9efdd4b 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -567,6 +567,11 @@ def test_write_cid_reference_content(pids, store): 
store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_reference(cid_ref_abs_path, pid) + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + cid_ref_file_pid = f.read() + + assert pid == cid_ref_file_pid.replace("\n", "") + def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" From 0178f28b2afeaf126485772586566437dd76e7ac Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 10:54:18 -0800 Subject: [PATCH 16/71] Add new method 'update_cid_reference' with new pytests --- src/hashstore/filehashstore.py | 42 ++++++++++++++++++++++++++++++- tests/test_filehashstore.py | 46 ++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index ec8e9804..06cf213b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1062,7 +1062,47 @@ def write_cid_reference(self, cid_ref_abs_path, pid): except Exception as err: exception_string = ( "FileHashStore - _write_cid_reference: failed to write reference for cid:" - + f" {cid_ref_abs_path}. Unexpected {err=}, {type(err)=}" + + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise IOError(exception_string) from err + + def update_cid_reference(self, cid_ref_abs_path, pid): + """Update an existing cid reference file with the given pid. Every pid in a reference + file is found on its own line. 
+ + Args: + cid_ref_abs_path (string): Absolute path to the cid ref file + pid (string): Authority-based or persistent identifier of object + """ + info_msg = ( + f"FileHashStore - update_cid_reference: Adding pid ({pid}) into cid reference" + + f" file: {cid_ref_abs_path}" + ) + logging.info(info_msg) + + try: + with open(cid_ref_abs_path, "a+", encoding="utf8") as cid_ref_file: + fcntl.flock(cid_ref_file, fcntl.LOCK_EX) + # Read the ref file to see if the pid is already referencing the cid + cid_ref_file_content = cid_ref_file.read() + + if pid in cid_ref_file_content: + err_msg = ( + f"FileHashStore - update_cid_reference: pid ({pid}) already reference in" + + f" cid reference file: {cid_ref_abs_path} " + ) + raise ValueError(err_msg) + else: + cid_ref_file.write(pid + "\n") + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) + return + except Exception as err: + exception_string = ( + "FileHashStore - update_cid_reference: failed to update reference for cid:" + + f" {cid_ref_abs_path} for pid: {pid}. 
Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise IOError(exception_string) from err diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index b9efdd4b..a69baff5 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -573,6 +573,52 @@ def test_write_cid_reference_content(pids, store): assert pid == cid_ref_file_pid.replace("\n", "") +def test_update_cid_reference_content(pids, store): + """Test that update_cid_reference updates the ref file as expected""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_reference(cid_ref_abs_path, pid) + + pid_other = "dou.test.1" + store.update_cid_reference(cid_ref_abs_path, pid_other) + + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + assert value == pid or value == pid_other + + +def test_update_cid_reference_content_multiple(pids, store): + """Test that update_cid_reference multiple updates""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_reference(cid_ref_abs_path, pid) + + cid_reference_list = [pid] + for i in range(0, 5): + store.update_cid_reference(cid_ref_abs_path, f"dou.test.{i}") + cid_reference_list.append(f"dou.test.{i}") + + line_count = 0 + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + line_count += 1 + value = line.strip() + assert value in cid_reference_list + + assert line_count == 6 + + def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" entity = "metadata" From 21233afbac7c90e80390c66ad401febde6c5ed6e Mon Sep 17 
00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 11:23:51 -0800 Subject: [PATCH 17/71] Add new 'delete_cid_reference_pid' method with new pytests --- src/hashstore/filehashstore.py | 49 ++++++++++++++++++++++++++++++++-- tests/test_filehashstore.py | 47 +++++++++++++++++++++++++++++--- 2 files changed, 90 insertions(+), 6 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 06cf213b..b1d7bc93 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1059,13 +1059,14 @@ def write_cid_reference(self, cid_ref_abs_path, pid): # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) return + except Exception as err: exception_string = ( "FileHashStore - _write_cid_reference: failed to write reference for cid:" + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) - raise IOError(exception_string) from err + raise err def update_cid_reference(self, cid_ref_abs_path, pid): """Update an existing cid reference file with the given pid. Every pid in a reference @@ -1099,13 +1100,57 @@ def update_cid_reference(self, cid_ref_abs_path, pid): # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) return + except Exception as err: exception_string = ( "FileHashStore - update_cid_reference: failed to update reference for cid:" + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) - raise IOError(exception_string) from err + raise err + + def delete_cid_reference_pid(self, cid_ref_abs_path, pid): + """Delete a pid in a cid reference file. 
+ + Args: + cid_ref_abs_path (string): Absolute path to the cid ref file + pid (string): Authority-based or persistent identifier of object + """ + info_msg = ( + f"FileHashStore - delete_cid_reference_pid: Deleting pid ({pid}) from cid reference" + + f" file: {cid_ref_abs_path}" + ) + logging.info(info_msg) + + try: + with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: + fcntl.flock(cid_ref_file, fcntl.LOCK_EX) + # Read the ref file to see if the pid is already referencing the cid + cid_ref_file_content = cid_ref_file.read() + + if pid not in cid_ref_file_content: + err_msg = ( + f"FileHashStore - delete_cid_reference_pid: pid ({pid}) does not exist in" + + f" cid reference file: {cid_ref_abs_path} " + ) + raise ValueError(err_msg) + + with open(cid_ref_abs_path, "w", encoding="utf8") as cid_ref_file: + fcntl.flock(cid_ref_file, fcntl.LOCK_EX) + cid_ref_file.write(cid_ref_file_content.replace(pid + "\n", "")) + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) + + return + + except Exception as err: + exception_string = ( + "FileHashStore - delete_cid_reference_pid: failed to update reference for cid:" + + f" {cid_ref_abs_path} for pid: {pid}. 
Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err def put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index a69baff5..f3b7d5f9 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -544,7 +544,7 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): def test_write_cid_reference(pids, store): - """Test that write_cid_reference writes a reference file""" + """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -557,7 +557,7 @@ def test_write_cid_reference(pids, store): def test_write_cid_reference_content(pids, store): - """Test that write_cid_reference writes the expected content""" + """Test that write_cid_reference writes the expected content.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -574,7 +574,7 @@ def test_write_cid_reference_content(pids, store): def test_update_cid_reference_content(pids, store): - """Test that update_cid_reference updates the ref file as expected""" + """Test that update_cid_reference updates the ref file as expected.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -594,7 +594,7 @@ def test_update_cid_reference_content(pids, store): def test_update_cid_reference_content_multiple(pids, store): - """Test that update_cid_reference multiple updates""" + """Test that update_cid_reference adds multiple references successfully.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -619,6 +619,45 @@ def test_update_cid_reference_content_multiple(pids, store): assert line_count == 6 +def test_delete_cid_reference_pid(pids, store): + """Test that delete_cid_reference deletes the given pid from the ref file.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + 
cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_reference(cid_ref_abs_path, pid) + + pid_other = "dou.test.1" + store.update_cid_reference(cid_ref_abs_path, pid_other) + store.delete_cid_reference_pid(cid_ref_abs_path, pid) + + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + print(value) + assert value == pid_other + + +def test_delete_cid_reference_pid_not_found(pids, store): + """Test that delete_cid_reference raises exception when pid not found.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_reference(cid_ref_abs_path, pid) + + pid_other = "dou.test.1" + store.update_cid_reference(cid_ref_abs_path, pid_other) + with pytest.raises(ValueError): + store.delete_cid_reference_pid(cid_ref_abs_path, "dou.not.found.1") + + def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" entity = "metadata" From 1fc158f5fb0a553406dc2d6758dce14d37be1d1f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 11:58:15 -0800 Subject: [PATCH 18/71] Rename refs related method names and update pytests --- src/hashstore/filehashstore.py | 29 ++++++++++++----------- tests/test_filehashstore.py | 42 +++++++++++++++++----------------- 2 files changed, 35 insertions(+), 36 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b1d7bc93..270aec98 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -504,7 +504,7 @@ def tag_object(self, pid, cid): else: # If not, create the cid ref file in '.../refs/cid' and write the pid self.create_path(os.path.dirname(cid_ref_abs_path)) - 
self.write_cid_reference(cid_ref_abs_path, pid) + self.write_cid_ref_file(cid_ref_abs_path, pid) # Then create the pid ref file in '.../refs/pid' with the cid as its content # TODO: Write the pid ref file that contains the cid finally: @@ -1037,7 +1037,7 @@ def delete_tmp_file(): ) logging.error(exception_string) - def write_cid_reference(self, cid_ref_abs_path, pid): + def write_cid_refs_file(self, cid_ref_abs_path, pid): """Write the reference file for the given content identifier (cid). A reference file contains every pid that references a cid on a new line. @@ -1046,7 +1046,7 @@ def write_cid_reference(self, cid_ref_abs_path, pid): pid (string): Authority-based or persistent identifier of object """ info_msg = ( - f"FileHashStore - _write_cid_reference: Writing pid ({pid}) into cid reference" + f"FileHashStore - write_cid_refs_file: Writing pid ({pid}) into cid reference" + f" file: {cid_ref_abs_path}" ) logging.info(info_msg) @@ -1062,22 +1062,21 @@ def write_cid_reference(self, cid_ref_abs_path, pid): except Exception as err: exception_string = ( - "FileHashStore - _write_cid_reference: failed to write reference for cid:" + "FileHashStore - write_cid_refs_file: failed to write reference for cid:" + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise err - def update_cid_reference(self, cid_ref_abs_path, pid): - """Update an existing cid reference file with the given pid. Every pid in a reference - file is found on its own line. + def update_cid_refs(self, cid_ref_abs_path, pid): + """Update an existing cid reference file with the given pid. 
Args: cid_ref_abs_path (string): Absolute path to the cid ref file pid (string): Authority-based or persistent identifier of object """ info_msg = ( - f"FileHashStore - update_cid_reference: Adding pid ({pid}) into cid reference" + f"FileHashStore - update_cid_refs: Adding pid ({pid}) into cid reference" + f" file: {cid_ref_abs_path}" ) logging.info(info_msg) @@ -1090,7 +1089,7 @@ def update_cid_reference(self, cid_ref_abs_path, pid): if pid in cid_ref_file_content: err_msg = ( - f"FileHashStore - update_cid_reference: pid ({pid}) already reference in" + f"FileHashStore - update_cid_refs: pid ({pid}) already reference in" + f" cid reference file: {cid_ref_abs_path} " ) raise ValueError(err_msg) @@ -1103,21 +1102,21 @@ def update_cid_reference(self, cid_ref_abs_path, pid): except Exception as err: exception_string = ( - "FileHashStore - update_cid_reference: failed to update reference for cid:" + "FileHashStore - update_cid_refs: failed to update reference for cid:" + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise err - def delete_cid_reference_pid(self, cid_ref_abs_path, pid): - """Delete a pid in a cid reference file. + def delete_cid_refs_pid(self, cid_ref_abs_path, pid): + """Delete a pid from a cid reference file. 
Args: cid_ref_abs_path (string): Absolute path to the cid ref file pid (string): Authority-based or persistent identifier of object """ info_msg = ( - f"FileHashStore - delete_cid_reference_pid: Deleting pid ({pid}) from cid reference" + f"FileHashStore - delete_cid_refs_pid: Deleting pid ({pid}) from cid reference" + f" file: {cid_ref_abs_path}" ) logging.info(info_msg) @@ -1130,7 +1129,7 @@ def delete_cid_reference_pid(self, cid_ref_abs_path, pid): if pid not in cid_ref_file_content: err_msg = ( - f"FileHashStore - delete_cid_reference_pid: pid ({pid}) does not exist in" + f"FileHashStore - delete_cid_refs_pid: pid ({pid}) does not exist in" + f" cid reference file: {cid_ref_abs_path} " ) raise ValueError(err_msg) @@ -1146,7 +1145,7 @@ def delete_cid_reference_pid(self, cid_ref_abs_path, pid): except Exception as err: exception_string = ( - "FileHashStore - delete_cid_reference_pid: failed to update reference for cid:" + "FileHashStore - delete_cid_refs_pid: failed to update reference for cid:" + f" {cid_ref_abs_path} for pid: {pid}. 
Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index f3b7d5f9..472f29e1 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -543,7 +543,7 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): input_stream.close() -def test_write_cid_reference(pids, store): +def test_write_cid_ref_file(pids, store): """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): entity = "refs" @@ -552,12 +552,12 @@ def test_write_cid_reference(pids, store): "/refs/", "/refs/cid/" ) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_reference(cid_ref_abs_path, pid) + store.write_cid_refs_file(cid_ref_abs_path, pid) assert os.path.exists(cid_ref_abs_path) -def test_write_cid_reference_content(pids, store): - """Test that write_cid_reference writes the expected content.""" +def test_write_cid_ref_file_content(pids, store): + """Test that write_cid_ref_file writes the expected content.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -565,7 +565,7 @@ def test_write_cid_reference_content(pids, store): "/refs/", "/refs/cid/" ) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_reference(cid_ref_abs_path, pid) + store.write_cid_refs_file(cid_ref_abs_path, pid) with open(cid_ref_abs_path, "r", encoding="utf8") as f: cid_ref_file_pid = f.read() @@ -573,8 +573,8 @@ def test_write_cid_reference_content(pids, store): assert pid == cid_ref_file_pid.replace("\n", "") -def test_update_cid_reference_content(pids, store): - """Test that update_cid_reference updates the ref file as expected.""" +def test_update_cid_ref_content(pids, store): + """Test that update_cid_ref updates the ref file as expected.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -582,10 +582,10 @@ def test_update_cid_reference_content(pids, store): "/refs/", "/refs/cid/" ) 
store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_reference(cid_ref_abs_path, pid) + store.write_cid_refs_file(cid_ref_abs_path, pid) pid_other = "dou.test.1" - store.update_cid_reference(cid_ref_abs_path, pid_other) + store.update_cid_refs(cid_ref_abs_path, pid_other) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): @@ -593,8 +593,8 @@ def test_update_cid_reference_content(pids, store): assert value == pid or value == pid_other -def test_update_cid_reference_content_multiple(pids, store): - """Test that update_cid_reference adds multiple references successfully.""" +def test_update_cid_ref_content_multiple(pids, store): + """Test that update_cid_ref adds multiple references successfully.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -602,11 +602,11 @@ def test_update_cid_reference_content_multiple(pids, store): "/refs/", "/refs/cid/" ) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_reference(cid_ref_abs_path, pid) + store.write_cid_refs_file(cid_ref_abs_path, pid) cid_reference_list = [pid] for i in range(0, 5): - store.update_cid_reference(cid_ref_abs_path, f"dou.test.{i}") + store.update_cid_refs(cid_ref_abs_path, f"dou.test.{i}") cid_reference_list.append(f"dou.test.{i}") line_count = 0 @@ -619,7 +619,7 @@ def test_update_cid_reference_content_multiple(pids, store): assert line_count == 6 -def test_delete_cid_reference_pid(pids, store): +def test_delete_cid_ref_pid(pids, store): """Test that delete_cid_reference deletes the given pid from the ref file.""" for pid in pids.keys(): entity = "refs" @@ -628,11 +628,11 @@ def test_delete_cid_reference_pid(pids, store): "/refs/", "/refs/cid/" ) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_reference(cid_ref_abs_path, pid) + store.write_cid_refs_file(cid_ref_abs_path, pid) pid_other = "dou.test.1" - store.update_cid_reference(cid_ref_abs_path, pid_other) - 
store.delete_cid_reference_pid(cid_ref_abs_path, pid) + store.update_cid_refs(cid_ref_abs_path, pid_other) + store.delete_cid_refs_pid(cid_ref_abs_path, pid) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): @@ -641,7 +641,7 @@ def test_delete_cid_reference_pid(pids, store): assert value == pid_other -def test_delete_cid_reference_pid_not_found(pids, store): +def test_delete_cid_ref_pid_pid_not_found(pids, store): """Test that delete_cid_reference raises exception when pid not found.""" for pid in pids.keys(): entity = "refs" @@ -650,12 +650,12 @@ def test_delete_cid_reference_pid_not_found(pids, store): "/refs/", "/refs/cid/" ) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_reference(cid_ref_abs_path, pid) + store.write_cid_refs_file(cid_ref_abs_path, pid) pid_other = "dou.test.1" - store.update_cid_reference(cid_ref_abs_path, pid_other) + store.update_cid_refs(cid_ref_abs_path, pid_other) with pytest.raises(ValueError): - store.delete_cid_reference_pid(cid_ref_abs_path, "dou.not.found.1") + store.delete_cid_refs_pid(cid_ref_abs_path, "dou.not.found.1") def test_put_metadata_with_path(pids, store): From b7833f0639bd7084e51e2bbf4e4ed15082d751fa Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 12:15:46 -0800 Subject: [PATCH 19/71] Add new 'delete_cid_refs_file' method with new pytests --- src/hashstore/filehashstore.py | 40 ++++++++++++++++++++++++++++++- tests/test_filehashstore.py | 44 ++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 270aec98..3d94277c 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -504,7 +504,7 @@ def tag_object(self, pid, cid): else: # If not, create the cid ref file in '.../refs/cid' and write the pid self.create_path(os.path.dirname(cid_ref_abs_path)) - self.write_cid_ref_file(cid_ref_abs_path, pid) + 
self.write_cid_refs_file(cid_ref_abs_path, pid) # Then create the pid ref file in '.../refs/pid' with the cid as its content # TODO: Write the pid ref file that contains the cid finally: @@ -1151,6 +1151,44 @@ def delete_cid_refs_pid(self, cid_ref_abs_path, pid): logging.error(exception_string) raise err + def delete_cid_refs_file(self, cid_ref_abs_path): + """Delete a cid reference file. There must be no references remaining. + + Args: + cid_ref_abs_path (string): Absolute path to the cid ref file + pid (string): Authority-based or persistent identifier of object + """ + info_msg = ( + "FileHashStore - delete_cid_refs_file: Deleting reference file: %s", + cid_ref_abs_path, + ) + logging.info(info_msg) + + try: + if not os.path.exists(cid_ref_abs_path): + err_msg = ( + "FileHashStore - delete_cid_refs_file: Cid reference file not found: %s", + cid_ref_abs_path, + ) + raise FileNotFoundError(err_msg) + if os.path.getsize(cid_ref_abs_path) != 0: + err_msg = ( + "FileHashStore - delete_cid_refs_file: Failed to delete cid reference file." + + f" File is not empty: {cid_ref_abs_path} " + ) + raise OSError(err_msg) + else: + os.remove(cid_ref_abs_path) + return + + except Exception as err: + exception_string = ( + "FileHashStore - delete_cid_refs_file: failed to delete reference file:" + + f" {cid_ref_abs_path}. Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err + def put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the given pid and format_id as the permanent address. 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 472f29e1..caee71fe 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -658,6 +658,50 @@ def test_delete_cid_ref_pid_pid_not_found(pids, store): store.delete_cid_refs_pid(cid_ref_abs_path, "dou.not.found.1") +def test_delete_cid_ref_pid_file(pids, store): + """Test that delete_cid_refs_file deletes a reference file.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + store.delete_cid_refs_pid(cid_ref_abs_path, pid) + store.delete_cid_refs_file(cid_ref_abs_path) + + assert not os.path.exists(cid_ref_abs_path) + + +def test_delete_cid_ref_pid_file_not_empty(pids, store): + """Test that delete_cid_refs_file raises an exception when refs file not empty.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + + with pytest.raises(OSError): + store.delete_cid_refs_file(cid_ref_abs_path) + + +def test_delete_cid_ref_pid_file_not_found(pids, store): + """Test that delete_cid_refs_file raises an exception when refs file not found.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.build_abs_path(entity, cid).replace( + "/refs/", "/refs/cid/" + ) + + with pytest.raises(FileNotFoundError): + store.delete_cid_refs_file(cid_ref_abs_path) + + def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" entity = "metadata" From d4e8274d1bfcaefcccd4dc6b704144eb7976cfe1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 12:24:55 -0800 Subject: [PATCH 20/71] 
Add missing docstring for 'tag_object' --- src/hashstore/filehashstore.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3d94277c..41fc822e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -478,6 +478,12 @@ def store_object( return object_metadata def tag_object(self, pid, cid): + """Tag an object that has been stored with a pid reference. + + Args: + pid (string): Authority-based or persistent identifier of object + cid (string): Content identifier + """ # Wait for the cid to release if it's being tagged while cid in self.reference_locked_cids: logging.debug( @@ -500,7 +506,7 @@ def tag_object(self, pid, cid): ) if os.path.exists(cid_ref_abs_path): # If it does, read the file and add the new pid on its own line - print("Add pid to reference file") + self.update_cid_refs(cid_ref_abs_path, pid) else: # If not, create the cid ref file in '.../refs/cid' and write the pid self.create_path(os.path.dirname(cid_ref_abs_path)) From 28d46718ec67ea5c0042965b22343c33cc8115e3 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 12:53:03 -0800 Subject: [PATCH 21/71] Add new 'write_pid_refs_file' method with new pytests --- src/hashstore/filehashstore.py | 52 ++++++++++++++++++++++++++++++- tests/test_filehashstore.py | 56 ++++++++++++++++++++++++++-------- 2 files changed, 95 insertions(+), 13 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 41fc822e..3b6b0fb9 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -484,6 +484,7 @@ def tag_object(self, pid, cid): pid (string): Authority-based or persistent identifier of object cid (string): Content identifier """ + # TODO: Write tests for this method # Wait for the cid to release if it's being tagged while cid in self.reference_locked_cids: logging.debug( @@ -512,7 +513,11 @@ def tag_object(self, pid, cid): 
self.create_path(os.path.dirname(cid_ref_abs_path)) self.write_cid_refs_file(cid_ref_abs_path, pid) # Then create the pid ref file in '.../refs/pid' with the cid as its content - # TODO: Write the pid ref file that contains the cid + pid_hash = self.computehash(pid, self.algorithm) + pid_ref_abs_path = self.build_abs_path(entity, pid_hash).replace( + "/refs/", "/refs/pid/" + ) + self.write_pid_refs_file(pid_ref_abs_path, cid) finally: # Release cid with self.reference_lock: @@ -1195,6 +1200,51 @@ def delete_cid_refs_file(self, cid_ref_abs_path): logging.error(exception_string) raise err + def write_pid_refs_file(self, pid_ref_abs_path, cid): + """Write the reference file for the given pid (persistent identifier). A reference + file for a pid contains the cid that it references. Its permanent address is the pid + hash with HashStore's default store algorithm and follows its directory structure. + + Args: + pid_ref_abs_path (string): Absolute path to the pid ref file + cid (string): Content identifier + """ + info_msg = ( + f"FileHashStore - write_pid_refs_file: Writing cid ({cid}) into pid reference" + + f" file: {pid_ref_abs_path}" + ) + logging.info(info_msg) + + if os.path.exists(pid_ref_abs_path): + with open(pid_ref_abs_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + if pid_refs_cid == cid: + return + else: + exception_string = ( + "FileHashStore - write_pid_refs_file: pid reference file exists but" + + f" cid ({cid}) is different from cid stored ({pid_refs_cid})." 
+ ) + logging.error(exception_string) + raise ValueError(exception_string) + else: + try: + with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: + fcntl.flock(pid_ref_file, fcntl.LOCK_EX) + pid_ref_file.write(cid) + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) + return + + except Exception as err: + exception_string = ( + "FileHashStore - write_pid_refs_file: failed to write pid reference file:" + + f" {pid_ref_abs_path} for cid: {cid}. Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err + def put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the given pid and format_id as the permanent address. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index caee71fe..319f0c87 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -543,7 +543,7 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): input_stream.close() -def test_write_cid_ref_file(pids, store): +def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): entity = "refs" @@ -556,7 +556,7 @@ def test_write_cid_ref_file(pids, store): assert os.path.exists(cid_ref_abs_path) -def test_write_cid_ref_file_content(pids, store): +def test_write_cid_refs_file_content(pids, store): """Test that write_cid_ref_file writes the expected content.""" for pid in pids.keys(): entity = "refs" @@ -573,7 +573,7 @@ def test_write_cid_ref_file_content(pids, store): assert pid == cid_ref_file_pid.replace("\n", "") -def test_update_cid_ref_content(pids, store): +def test_update_cid_refs_content(pids, store): """Test that update_cid_ref updates the ref file as expected.""" for pid in pids.keys(): entity = "refs" @@ -593,8 +593,8 @@ def test_update_cid_ref_content(pids, store): assert 
value == pid or value == pid_other -def test_update_cid_ref_content_multiple(pids, store): - """Test that update_cid_ref adds multiple references successfully.""" +def test_update_cid_refs_content_multiple(pids, store): + """Test that update_cid_refs adds multiple references successfully.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -619,8 +619,8 @@ def test_update_cid_ref_content_multiple(pids, store): assert line_count == 6 -def test_delete_cid_ref_pid(pids, store): - """Test that delete_cid_reference deletes the given pid from the ref file.""" +def test_delete_cid_refs_pid(pids, store): + """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -641,8 +641,8 @@ def test_delete_cid_ref_pid(pids, store): assert value == pid_other -def test_delete_cid_ref_pid_pid_not_found(pids, store): - """Test that delete_cid_reference raises exception when pid not found.""" +def test_delete_cid_refs_pid_pid_not_found(pids, store): + """Test that delete_cid_refs_pid raises exception when pid not found.""" for pid in pids.keys(): entity = "refs" cid = pids[pid]["sha256"] @@ -658,7 +658,7 @@ def test_delete_cid_ref_pid_pid_not_found(pids, store): store.delete_cid_refs_pid(cid_ref_abs_path, "dou.not.found.1") -def test_delete_cid_ref_pid_file(pids, store): +def test_delete_cid_refs_pid_file(pids, store): """Test that delete_cid_refs_file deletes a reference file.""" for pid in pids.keys(): entity = "refs" @@ -674,7 +674,7 @@ def test_delete_cid_ref_pid_file(pids, store): assert not os.path.exists(cid_ref_abs_path) -def test_delete_cid_ref_pid_file_not_empty(pids, store): +def test_delete_cid_refs_pid_file_not_empty(pids, store): """Test that delete_cid_refs_file raises an exception when refs file not empty.""" for pid in pids.keys(): entity = "refs" @@ -689,7 +689,7 @@ def test_delete_cid_ref_pid_file_not_empty(pids, store): 
store.delete_cid_refs_file(cid_ref_abs_path) -def test_delete_cid_ref_pid_file_not_found(pids, store): +def test_delete_cid_refs_pid_file_not_found(pids, store): """Test that delete_cid_refs_file raises an exception when refs file not found.""" for pid in pids.keys(): entity = "refs" @@ -702,6 +702,38 @@ def test_delete_cid_refs_pid_file_not_found(pids, store): store.delete_cid_refs_file(cid_ref_abs_path) +def test_write_pid_refs_file(pids, store): + """Test that write_pid_refs_file writes a reference file.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + pid_hash = store.computehash(pid, store.algorithm) + pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( + "/refs/", "/refs/pid/" + ) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store.write_pid_refs_file(pid_ref_abs_path, cid) + assert os.path.exists(pid_ref_abs_path) + + +def test_write_pid_refs_file_content(pids, store): + """Test that write_pid_refs_file writes the expected content.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + pid_hash = store.computehash(pid, store.algorithm) + pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( + "/refs/", "/refs/pid/" + ) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store.write_pid_refs_file(pid_ref_abs_path, cid) + + with open(pid_ref_abs_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + + assert cid == pid_refs_cid + + def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" entity = "metadata" From 5689c3c7f961a6d82893053d6ae25ca6e08c4dcf Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 12:57:02 -0800 Subject: [PATCH 22/71] Add new pytests for 'write_pid_refs_file' method --- tests/test_filehashstore.py | 36 ++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 319f0c87..569f0fae 100644 ---
a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -734,6 +734,42 @@ def test_write_pid_refs_file_content(pids, store): assert cid == pid_refs_cid +def test_write_pid_refs_file_exists(pids, store): + """Test that write_pid_refs_file returns when ref already exists and the + cid given is the same.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + pid_hash = store.computehash(pid, store.algorithm) + pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( + "/refs/", "/refs/pid/" + ) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store.write_pid_refs_file(pid_ref_abs_path, cid) + # This should not write and return + store.write_pid_refs_file(pid_ref_abs_path, cid) + + with open(pid_ref_abs_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + + assert cid == pid_refs_cid + + +def test_write_pid_refs_file_exists_different_cid(pids, store): + """Test that write_pid_refs_file raises a ValueError when ref already exists + and the cid given is different.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + pid_hash = store.computehash(pid, store.algorithm) + pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( + "/refs/", "/refs/pid/" + ) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store.write_pid_refs_file(pid_ref_abs_path, cid) + with pytest.raises(ValueError): + store.write_pid_refs_file(pid_ref_abs_path, "abc123") + def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" entity = "metadata" From 9f5cb601efcb513249031f9bde3d7df25a0d3f98 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 13:04:21 -0800 Subject: [PATCH 23/71] Add new method 'delete_pid_refs_file' with new pytests --- src/hashstore/filehashstore.py | 32 +++++++++++++++++++++++++++++++- tests/test_filehashstore.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py
b/src/hashstore/filehashstore.py index 3b6b0fb9..a5f64b6c 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1167,7 +1167,6 @@ def delete_cid_refs_file(self, cid_ref_abs_path): Args: cid_ref_abs_path (string): Absolute path to the cid ref file - pid (string): Authority-based or persistent identifier of object """ info_msg = ( "FileHashStore - delete_cid_refs_file: Deleting reference file: %s", @@ -1245,6 +1244,37 @@ def write_pid_refs_file(self, pid_ref_abs_path, cid): logging.error(exception_string) raise err + def delete_pid_refs_file(self, pid_ref_abs_path): + """Delete a pid reference file. + + Args: + pid_ref_abs_path (string): Absolute path to the pid ref file + """ + info_msg = ( + "FileHashStore - delete_pid_refs_file: Deleting reference file: %s", + pid_ref_abs_path, + ) + logging.info(info_msg) + + try: + if not os.path.exists(pid_ref_abs_path): + err_msg = ( + "FileHashStore - delete_pid_refs_file: pid reference file not found: %s", + pid_ref_abs_path, + ) + raise FileNotFoundError(err_msg) + else: + os.remove(pid_ref_abs_path) + return + + except Exception as err: + exception_string = ( + "FileHashStore - delete_pid_refs_file: failed to delete reference file:" + + f" {pid_ref_abs_path}. Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err + def put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the given pid and format_id as the permanent address. 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 569f0fae..eb63fcca 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -770,6 +770,36 @@ def test_write_pid_refs_file_exists_different_cid(pids, store): with pytest.raises(ValueError): store.write_pid_refs_file(pid_ref_abs_path, "abc123") + +def test_delete_pid_refs_file(pids, store): + """Test that delete_pid_refs_file deletes a reference file.""" + for pid in pids.keys(): + entity = "refs" + cid = pids[pid]["sha256"] + pid_hash = store.computehash(pid, store.algorithm) + pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( + "/refs/", "/refs/pid/" + ) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store.write_pid_refs_file(pid_ref_abs_path, cid) + store.delete_pid_refs_file(pid_ref_abs_path) + + assert not os.path.exists(pid_ref_abs_path) + + +def test_delete_pid_refs_file_file_not_found(pids, store): + """Test that delete_pid_refs_file raises an exception when refs file not found.""" + for pid in pids.keys(): + entity = "refs" + pid_hash = store.computehash(pid, store.algorithm) + pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( + "/refs/", "/refs/pid/" + ) + + with pytest.raises(FileNotFoundError): + store.delete_cid_refs_file(pid_ref_abs_path) + + def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" entity = "metadata" From 9c6509e16484511a57eb388c07221a601200c3da Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 13:12:43 -0800 Subject: [PATCH 24/71] Update --run-slow pytests --- tests/test_filehashstore_interface.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 329af168..1f0fef3d 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -511,8 +511,7 @@ def test_store_object_large_file(store): pid = "testfile_filehashstore" 
object_metadata = store.store_object(pid, file_path) object_metadata_id = object_metadata.id - pid_sha256_hex_digest = store.get_sha256_hex_digest(pid) - assert object_metadata_id == pid_sha256_hex_digest + assert object_metadata_id == object_metadata.hex_digests.get("sha256") @slow_test @@ -531,8 +530,7 @@ def test_store_object_sparse_large_file(store): pid = "testfile_filehashstore" object_metadata = store.store_object(pid, file_path) object_metadata_id = object_metadata.id - pid_sha256_hex_digest = store.get_sha256_hex_digest(pid) - assert object_metadata_id == pid_sha256_hex_digest + assert object_metadata_id == object_metadata.hex_digests.get("sha256") def test_store_metadata(pids, store): From 2738d3fb8bcf166160f3055a8caf20fa5145591f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 13:20:27 -0800 Subject: [PATCH 25/71] Code 'find_object' method, missing pytests --- src/hashstore/filehashstore.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a5f64b6c..8368e0f3 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -531,9 +531,30 @@ def tag_object(self, pid, cid): return def find_object(self, pid): + logging.debug( + "FileHashStore - find_object: Request to find object for for pid: %s", pid + ) + # TODO: Write tests for this method + self._is_string_none_or_empty(pid, "pid", "find_object") + # Get the path to the pid reference by calculating its hash in '.../refs/pid' - # Read the file to get the cid from the pid reference and return it - return + entity = "refs" + pid_hash = self.computehash(pid, self.algorithm) + pid_ref_abs_path = self.build_abs_path(entity, pid_hash).replace( + "/refs/", "/refs/pid/" + ) + if not os.path.exists(pid_ref_abs_path): + err_msg = ( + f"FileHashStore - find_object: pid ({pid}) reference file not found: " + + pid_ref_abs_path, + ) + raise FileNotFoundError(err_msg) + else: + 
# Read the file to get the cid from the pid reference + with open(pid_ref_abs_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + + return pid_refs_cid def store_metadata(self, pid, metadata, format_id=None): logging.debug( From 2e03a6a01371c76731e0f0ea2e922af38ac1cdd8 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 13:29:46 -0800 Subject: [PATCH 26/71] Fix bug in 'tag_object', refactor 'retrieve_object' method and update pytests --- src/hashstore/filehashstore.py | 8 +++----- tests/test_filehashstore_interface.py | 3 ++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 8368e0f3..b69dffc0 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -517,6 +517,7 @@ def tag_object(self, pid, cid): pid_ref_abs_path = self.build_abs_path(entity, pid_hash).replace( "/refs/", "/refs/pid/" ) + self.create_path(os.path.dirname(pid_ref_abs_path)) self.write_pid_refs_file(pid_ref_abs_path, cid) finally: # Release cid @@ -609,13 +610,10 @@ def retrieve_object(self, pid): ) self._is_string_none_or_empty(pid, "pid", "retrieve_object") - # TODO: Find object from the pid reference file - + object_cid = self.find_object(pid) entity = "objects" - object_cid = self.get_sha256_hex_digest(pid) - object_exists = self.exists(entity, object_cid) - if object_exists: + if object_cid: logging.debug( "FileHashStore - retrieve_object: Metadata exists for pid: %s, retrieving object.", pid, diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 1f0fef3d..1da70497 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -705,6 +705,7 @@ def test_retrieve_object(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) store.store_metadata(pid, syspath, 
format_id) obj_stream = store.retrieve_object(pid) sha256_hex = store.computehash(obj_stream) @@ -723,7 +724,7 @@ def test_retrieve_object_pid_invalid(store): """Test retrieve_object raises error when supplied with bad pid.""" pid = "jtao.1700.1" pid_does_not_exist = pid + "test" - with pytest.raises(ValueError): + with pytest.raises(FileNotFoundError): store.retrieve_object(pid_does_not_exist) From 5b6a15fca61265c70f5924e8806c9d46d906633a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 13:33:18 -0800 Subject: [PATCH 27/71] Fix retrieve_object pytest in test_hashstore_client --- tests/test_hashstore_client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index f3f24477..999f26ad 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -108,7 +108,8 @@ def test_retrieve_objects(capsys, pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - _object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) client_module_path = f"{client_directory}/client.py" test_store = store.root From e698c4047ff769c44e593f0058a39393c13d7fd5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sat, 11 Nov 2023 13:35:53 -0800 Subject: [PATCH 28/71] Refactor 'get_hex_digest' method and update pytests --- src/hashstore/filehashstore.py | 4 +--- tests/test_filehashstore_interface.py | 5 +++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b69dffc0..7aa2f0c5 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -702,11 +702,9 @@ def get_hex_digest(self, pid, algorithm): self._is_string_none_or_empty(pid, "pid", "get_hex_digest") self._is_string_none_or_empty(algorithm, "algorithm", "get_hex_digest") - # TODO: Find object from the 
pid reference file - entity = "objects" algorithm = self.clean_algorithm(algorithm) - object_cid = self.get_sha256_hex_digest(pid) + object_cid = self.find_object(pid) if not self.exists(entity, object_cid): exception_string = ( f"FileHashStore - get_hex_digest: No object found for pid: {pid}" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 1da70497..96c5ebcc 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -884,7 +884,8 @@ def test_get_hex_digest(store): path = test_dir + pid filename = pid + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) _metadata_cid = store.store_metadata(pid, syspath, format_id) sha3_256_hex_digest = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" @@ -898,7 +899,7 @@ def test_get_hex_digest_pid_not_found(store): pid = "jtao.1700.1" pid_does_not_exist = pid + "test" algorithm = "sha256" - with pytest.raises(ValueError): + with pytest.raises(FileNotFoundError): store.get_hex_digest(pid_does_not_exist, algorithm) From 81bb767f3134e6bfcf77cd0660b839c5238c714d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 09:42:22 -0800 Subject: [PATCH 29/71] Initial refactor to 'delete_object' to find correct object to delete and update pytests --- src/hashstore/filehashstore.py | 6 +----- tests/test_filehashstore_interface.py | 3 ++- tests/test_hashstore_client.py | 3 ++- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7aa2f0c5..ebfa0c8b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -662,12 +662,8 @@ def delete_object(self, pid): ) self._is_string_none_or_empty(pid, "pid", "delete_object") - # TODO: Also find the reference file and delete it if there's only one ref - # 
Else delete the pid in the cid refs file - # Also delete the pid ref file - entity = "objects" - object_cid = self.get_sha256_hex_digest(pid) + object_cid = self.find_object(pid) self.delete(entity, object_cid) logging.info( diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 96c5ebcc..316d1676 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -803,7 +803,8 @@ def test_delete_objects(pids, store): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) assert store.count(entity) == 0 diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 999f26ad..7d1e01a0 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -189,7 +189,8 @@ def test_delete_objects(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - _object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) client_module_path = f"{client_directory}/client.py" test_store = store.root From ccaa768538968f0ad6d1243fad5061cdf231526e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 10:11:41 -0800 Subject: [PATCH 30/71] Add new method 'get_refs_abs_path' and refactor 'FileHashStore' and pytests --- src/hashstore/filehashstore.py | 38 ++++++++++------ tests/test_filehashstore.py | 83 ++++++---------------------------- 2 files changed, 39 insertions(+), 82 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index ebfa0c8b..ec4316de 100644 --- a/src/hashstore/filehashstore.py +++ 
b/src/hashstore/filehashstore.py @@ -501,10 +501,7 @@ def tag_object(self, pid, cid): self.reference_locked_cids.append(cid) try: # Check to see if reference file already exists for the cid - entity = "refs" - cid_ref_abs_path = self.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = self.get_refs_abs_path("cid", cid) if os.path.exists(cid_ref_abs_path): # If it does, read the file and add the new pid on its own line self.update_cid_refs(cid_ref_abs_path, pid) @@ -513,10 +510,7 @@ def tag_object(self, pid, cid): self.create_path(os.path.dirname(cid_ref_abs_path)) self.write_cid_refs_file(cid_ref_abs_path, pid) # Then create the pid ref file in '.../refs/pid' with the cid as its content - pid_hash = self.computehash(pid, self.algorithm) - pid_ref_abs_path = self.build_abs_path(entity, pid_hash).replace( - "/refs/", "/refs/pid/" - ) + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) self.create_path(os.path.dirname(pid_ref_abs_path)) self.write_pid_refs_file(pid_ref_abs_path, cid) finally: @@ -538,12 +532,7 @@ def find_object(self, pid): # TODO: Write tests for this method self._is_string_none_or_empty(pid, "pid", "find_object") - # Get the path to the pid reference by calculating its hash in '.../refs/pid' - entity = "refs" - pid_hash = self.computehash(pid, self.algorithm) - pid_ref_abs_path = self.build_abs_path(entity, pid_hash).replace( - "/refs/", "/refs/pid/" - ) + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) if not os.path.exists(pid_ref_abs_path): err_msg = ( f"FileHashStore - find_object: pid ({pid}) reference file not found: " @@ -662,6 +651,10 @@ def delete_object(self, pid): ) self._is_string_none_or_empty(pid, "pid", "delete_object") + # Remove pid from cid reference file + # self.delete_cid_refs_pid(, pid) + # Delete cid reference file if it's empty + # Delete pid reference file entity = "objects" object_cid = self.find_object(pid) self.delete(entity, object_cid) @@ -1836,6 +1829,23 @@ def 
build_abs_path(self, entity, hash_id, extension=""): absolute_path = os.path.join(root_dir, *paths) + extension return absolute_path + def get_refs_abs_path(self, ref_type, pid): + """Get the absolute path to the reference file for the given pid. + + Args: + ref_type (string): 'pid' or 'cid' + pid (string): Authority-based or persistent identifier + + Returns: + ref_file_abs_path (string): Path to the ref file for the given type and pid + """ + entity = "refs" + pid_hash = self.computehash(pid, self.algorithm) + ref_file_abs_path = self.build_abs_path(entity, pid_hash).replace( + "/refs/", f"/refs/{ref_type}/" + ) + return ref_file_abs_path + def count(self, entity): """Return count of the number of files in the `root` directory. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index eb63fcca..9e546f5b 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -546,11 +546,8 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) assert os.path.exists(cid_ref_abs_path) @@ -559,11 +556,8 @@ def test_write_cid_refs_file(pids, store): def test_write_cid_refs_file_content(pids, store): """Test that write_cid_ref_file writes the expected content.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) @@ -576,11 +570,8 @@ def 
test_write_cid_refs_file_content(pids, store): def test_update_cid_refs_content(pids, store): """Test that update_cid_ref updates the ref file as expected.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) @@ -596,11 +587,8 @@ def test_update_cid_refs_content(pids, store): def test_update_cid_refs_content_multiple(pids, store): """Test that update_cid_refs adds multiple references successfully.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) @@ -622,11 +610,8 @@ def test_update_cid_refs_content_multiple(pids, store): def test_delete_cid_refs_pid(pids, store): """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) @@ -644,11 +629,8 @@ def test_delete_cid_refs_pid(pids, store): def test_delete_cid_refs_pid_pid_not_found(pids, store): """Test that delete_cid_refs_pid raises exception when pid not found.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) 
store.write_cid_refs_file(cid_ref_abs_path, pid) @@ -661,11 +643,8 @@ def test_delete_cid_refs_pid_pid_not_found(pids, store): def test_delete_cid_refs_pid_file(pids, store): """Test that delete_cid_refs_file deletes a reference file.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) store.delete_cid_refs_pid(cid_ref_abs_path, pid) @@ -677,11 +656,8 @@ def test_delete_cid_refs_pid_file(pids, store): def test_delete_cid_refs_pid_file_not_empty(pids, store): """Test that delete_cid_refs_file raises an exception when refs file not empty.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) @@ -692,12 +668,8 @@ def test_delete_cid_refs_pid_file_not_empty(pids, store): def test_delete_cid_refs_pid_file_not_found(pids, store): """Test that delete_cid_refs_file raises an exception when refs file not found.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - cid_ref_abs_path = store.build_abs_path(entity, cid).replace( - "/refs/", "/refs/cid/" - ) - + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) with pytest.raises(FileNotFoundError): store.delete_cid_refs_file(cid_ref_abs_path) @@ -705,12 +677,8 @@ def test_delete_cid_refs_pid_file_not_found(pids, store): def test_write_pid_refs_file(pids, store): """Test that write_pid_refs_file writes a reference file.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - pid_hash = store.computehash(pid, store.algorithm) - pid_ref_abs_path = 
store.build_abs_path(entity, pid_hash).replace( - "/refs/", "/refs/pid/" - ) + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store.write_pid_refs_file(pid_ref_abs_path, cid) assert os.path.exists(pid_ref_abs_path) @@ -719,12 +687,8 @@ def test_write_pid_refs_file(pids, store): def test_write_pid_refs_file_content(pids, store): """Test that write_pid_refs_file writes the expected content.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - pid_hash = store.computehash(pid, store.algorithm) - pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( - "/refs/", "/refs/pid/" - ) + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store.write_pid_refs_file(pid_ref_abs_path, cid) @@ -738,12 +702,8 @@ def test_write_pid_refs_file_exists(pids, store): """Test that write_pid_refs_file returns when ref already exists and the cid given is the same.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - pid_hash = store.computehash(pid, store.algorithm) - pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( - "/refs/", "/refs/pid/" - ) + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store.write_pid_refs_file(pid_ref_abs_path, cid) # This should not write and return @@ -759,12 +719,8 @@ def test_write_pid_refs_file_exists_different_cid(pids, store): """Test that write_pid_refs_file returns when ref already exists and the cid given is the same.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - pid_hash = store.computehash(pid, store.algorithm) - pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( - "/refs/", "/refs/pid/" - ) + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store.write_pid_refs_file(pid_ref_abs_path, cid) with 
pytest.raises(ValueError): @@ -774,12 +730,8 @@ def test_write_pid_refs_file_exists_different_cid(pids, store): def test_delete_pid_refs_file(pids, store): """Test that delete_pid_refs_file deletes a reference file.""" for pid in pids.keys(): - entity = "refs" cid = pids[pid]["sha256"] - pid_hash = store.computehash(pid, store.algorithm) - pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( - "/refs/", "/refs/pid/" - ) + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store.write_pid_refs_file(pid_ref_abs_path, cid) store.delete_pid_refs_file(pid_ref_abs_path) @@ -790,12 +742,7 @@ def test_delete_pid_refs_file(pids, store): def test_delete_pid_refs_file_file_not_found(pids, store): """Test that delete_pid_refs_file raises an exception when refs file not found.""" for pid in pids.keys(): - entity = "refs" - pid_hash = store.computehash(pid, store.algorithm) - pid_ref_abs_path = store.build_abs_path(entity, pid_hash).replace( - "/refs/", "/refs/pid/" - ) - + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) with pytest.raises(FileNotFoundError): store.delete_cid_refs_file(pid_ref_abs_path) From 8d44f42eb39e5204a8fff6c6a31d0df585ba177a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 10:31:03 -0800 Subject: [PATCH 31/71] Delete redundant 'get_sha256_hex_digest' method and refactor FileHashStore class and pytests --- src/hashstore/filehashstore.py | 26 +++++++------------------- tests/test_filehashstore.py | 7 ------- 2 files changed, 7 insertions(+), 26 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index ec4316de..9c9203ee 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -629,7 +629,7 @@ def retrieve_metadata(self, pid, format_id=None): checked_format_id = self._validate_format_id(format_id, "retrieve_metadata") entity = "metadata" - metadata_cid = self.get_sha256_hex_digest(pid + checked_format_id) 
+ metadata_cid = self.computehash(pid + checked_format_id) metadata_exists = self.exists(entity, metadata_cid) if metadata_exists: metadata_stream = self.open(entity, metadata_cid) @@ -674,7 +674,7 @@ def delete_metadata(self, pid, format_id=None): checked_format_id = self._validate_format_id(format_id, "delete_metadata") entity = "metadata" - metadata_cid = self.get_sha256_hex_digest(pid + checked_format_id) + metadata_cid = self.computehash(pid + checked_format_id) self.delete(entity, metadata_cid) logging.info( @@ -1302,7 +1302,7 @@ def put_metadata(self, metadata, pid, format_id): metadata_tmp = self._mktmpmetadata(metadata_stream) # Get target and related paths (permanent location) - metadata_cid = self.get_sha256_hex_digest(pid + format_id) + metadata_cid = self.computehash(pid + format_id) rel_path = "/".join(self.shard(metadata_cid)) full_path = self.get_store_path("metadata") / rel_path @@ -1602,11 +1602,12 @@ def clean_algorithm(self, algorithm_string): return cleaned_string def computehash(self, stream, algorithm=None): - """Compute hash of a file-like object using :attr:`algorithm` by default - or with optional algorithm supported. + """Compute the hash of a file-like object (or string) using :attr:`algorithm` by + default or with optional algorithm supported. Args: - stream (io.BufferedReader): A buffered stream of an object_cid object. \n + stream (mixed): A buffered stream (io.BufferedReader) of an object. A string is + also acceptable as they are a sequence of characters (Python only).\n algorithm (string): Algorithm of hex digest to generate. Returns: @@ -1926,19 +1927,6 @@ def _to_bytes(text): text = bytes(text, "utf8") return text - @staticmethod - def get_sha256_hex_digest(string): - """Calculate the SHA-256 digest of a UTF-8 encoded string. - - Args: - string (string): String to convert. - - Returns: - hex (string): Hexadecimal string. 
- """ - hex_digest = hashlib.sha256(string.encode("utf-8")).hexdigest() - return hex_digest - class Stream(object): """Common interface for file-like objects. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 9e546f5b..526444cb 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1090,10 +1090,3 @@ def test_to_bytes(store): # pylint: disable=W0212 string_bytes = store._to_bytes(string) assert isinstance(string_bytes, bytes) - - -def test_get_sha256_hex_digest(pids, store): - """Test for correct sha256 return value.""" - for pid in pids: - hash_val = store.get_sha256_hex_digest(pid) - assert hash_val == pids[pid]["object_cid"] From 1a921a0937d0786130f14687470227a4c2c8f8ad Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 10:59:10 -0800 Subject: [PATCH 32/71] Refactor 'delete_object' to delete all required pid or cid reference files --- src/hashstore/filehashstore.py | 36 ++++++++++++++++++++++------------ tests/test_filehashstore.py | 11 ++++++----- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 9c9203ee..bf52019e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -652,12 +652,19 @@ def delete_object(self, pid): self._is_string_none_or_empty(pid, "pid", "delete_object") # Remove pid from cid reference file - # self.delete_cid_refs_pid(, pid) - # Delete cid reference file if it's empty + cid = self.find_object(pid) + cid_ref_abs_path = self.get_refs_abs_path("cid", cid) + self.delete_cid_refs_pid(cid_ref_abs_path, pid) + # Delete cid reference file + # If the file is not empty, it will not be deleted. 
+ cid_refs_deleted = self.delete_cid_refs_file(cid_ref_abs_path) # Delete pid reference file - entity = "objects" - object_cid = self.find_object(pid) - self.delete(entity, object_cid) + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) + self.delete_pid_refs_file(pid_ref_abs_path) + # Finally, delete the object + if cid_refs_deleted: + entity = "objects" + self.delete(entity, cid) logging.info( "FileHashStore - delete_object: Successfully deleted object for pid: %s", @@ -1173,6 +1180,9 @@ def delete_cid_refs_file(self, cid_ref_abs_path): Args: cid_ref_abs_path (string): Absolute path to the cid ref file + + Returns: + boolean: True if deleted, False if not """ info_msg = ( "FileHashStore - delete_cid_refs_file: Deleting reference file: %s", @@ -1188,14 +1198,15 @@ def delete_cid_refs_file(self, cid_ref_abs_path): ) raise FileNotFoundError(err_msg) if os.path.getsize(cid_ref_abs_path) != 0: - err_msg = ( + warn_msg = ( "FileHashStore - delete_cid_refs_file: Failed to delete cid reference file." + f" File is not empty: {cid_ref_abs_path} " ) - raise OSError(err_msg) + logging.warning(warn_msg) + return False else: os.remove(cid_ref_abs_path) - return + return True except Exception as err: exception_string = ( @@ -1830,19 +1841,20 @@ def build_abs_path(self, entity, hash_id, extension=""): absolute_path = os.path.join(root_dir, *paths) + extension return absolute_path - def get_refs_abs_path(self, ref_type, pid): + def get_refs_abs_path(self, ref_type, hash_id): """Get the absolute path to the reference file for the given pid. 
Args: ref_type (string): 'pid' or 'cid' - pid (string): Authority-based or persistent identifier + hash_id (string): Authority-based, persistent or hash identifier Returns: ref_file_abs_path (string): Path to the ref file for the given type and pid """ entity = "refs" - pid_hash = self.computehash(pid, self.algorithm) - ref_file_abs_path = self.build_abs_path(entity, pid_hash).replace( + if ref_type == "pid": + hash_id = self.computehash(hash_id, self.algorithm) + ref_file_abs_path = self.build_abs_path(entity, hash_id).replace( "/refs/", f"/refs/{ref_type}/" ) return ref_file_abs_path diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 526444cb..83cd1e62 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -648,21 +648,22 @@ def test_delete_cid_refs_pid_file(pids, store): store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) store.delete_cid_refs_pid(cid_ref_abs_path, pid) - store.delete_cid_refs_file(cid_ref_abs_path) + cid_refs_deleted = store.delete_cid_refs_file(cid_ref_abs_path) + assert cid_refs_deleted assert not os.path.exists(cid_ref_abs_path) def test_delete_cid_refs_pid_file_not_empty(pids, store): - """Test that delete_cid_refs_file raises an exception when refs file not empty.""" + """Test that delete_cid_refs_file does not raise an exception when refs file + is not empty.""" for pid in pids.keys(): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) store.write_cid_refs_file(cid_ref_abs_path, pid) - - with pytest.raises(OSError): - store.delete_cid_refs_file(cid_ref_abs_path) + cid_refs_deleted = store.delete_cid_refs_file(cid_ref_abs_path) + assert not cid_refs_deleted def test_delete_cid_refs_pid_file_not_found(pids, store): From 4fee82323e0a02ded076dddf58a72c87943777fa Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 11:44:39 -0800 Subject: [PATCH 33/71] 
Synchronized 'delete_object' method with 'tag_object' method on cid value --- src/hashstore/filehashstore.py | 71 ++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 28 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index bf52019e..795b1f28 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -478,17 +478,11 @@ def store_object( return object_metadata def tag_object(self, pid, cid): - """Tag an object that has been stored with a pid reference. - - Args: - pid (string): Authority-based or persistent identifier of object - cid (string): Content identifier - """ # TODO: Write tests for this method # Wait for the cid to release if it's being tagged while cid in self.reference_locked_cids: logging.debug( - "FileHashStore - tag_object: (cid) %s is currently being tagged. Waiting.", + "FileHashStore - tag_object: (cid) %s is currently locked. Waiting.", cid, ) time.sleep(self.time_out_sec) @@ -650,27 +644,49 @@ def delete_object(self, pid): "FileHashStore - delete_object: Request to delete object for pid: %s", pid ) self._is_string_none_or_empty(pid, "pid", "delete_object") - - # Remove pid from cid reference file cid = self.find_object(pid) - cid_ref_abs_path = self.get_refs_abs_path("cid", cid) - self.delete_cid_refs_pid(cid_ref_abs_path, pid) - # Delete cid reference file - # If the file is not empty, it will not be deleted. - cid_refs_deleted = self.delete_cid_refs_file(cid_ref_abs_path) - # Delete pid reference file - pid_ref_abs_path = self.get_refs_abs_path("pid", pid) - self.delete_pid_refs_file(pid_ref_abs_path) - # Finally, delete the object - if cid_refs_deleted: - entity = "objects" - self.delete(entity, cid) - logging.info( - "FileHashStore - delete_object: Successfully deleted object for pid: %s", - pid, - ) - return True + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - delete_object: (cid) %s is currently locked. 
Waiting", + cid, + ) + time.sleep(self.time_out_sec) + # Modify reference_locked_cids consecutively + with self.reference_lock: + logging.debug( + "FileHashStore - delete_object: Adding cid: %s to reference_locked_cids.", + cid, + ) + self.reference_locked_cids.append(cid) + try: + # Remove pid from cid reference file + cid_ref_abs_path = self.get_refs_abs_path("cid", cid) + self.delete_cid_refs_pid(cid_ref_abs_path, pid) + # Delete cid reference file + # If the file is not empty, it will not be deleted. + cid_refs_deleted = self.delete_cid_refs_file(cid_ref_abs_path) + # Delete pid reference file + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) + self.delete_pid_refs_file(pid_ref_abs_path) + # Finally, delete the object + if cid_refs_deleted: + entity = "objects" + self.delete(entity, cid) + return True + finally: + # Release cid + with self.reference_lock: + logging.debug( + "FileHashStore - delete_object: Removing cid: %s from reference_locked_cids.", + cid, + ) + self.reference_locked_cids.remove(cid) + info_msg = ( + "FileHashStore - delete_object: Successfully deleted references and/or" + + f" objects associated with pid: {pid}" + ) + logging.info(info_msg) def delete_metadata(self, pid, format_id=None): logging.debug( @@ -1063,7 +1079,7 @@ def delete_tmp_file(): def write_cid_refs_file(self, cid_ref_abs_path, pid): """Write the reference file for the given content identifier (cid). A reference - file contains every pid that references a cid on a new line. + file contains every pid that references a cid each on its own line. 
Args: cid_ref_abs_path (string): Absolute path to the cid ref file @@ -1164,7 +1180,6 @@ def delete_cid_refs_pid(self, cid_ref_abs_path, pid): # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) - return except Exception as err: From 6316044bc8e0e7930533827f80bfa98afcd8b639 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 11:55:23 -0800 Subject: [PATCH 34/71] Add new pytests for 'delete_object' --- tests/test_filehashstore_interface.py | 54 ++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 316d1676..dda3b4c3 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -795,7 +795,7 @@ def test_retrieve_metadata_format_id_empty_spaces(store): def test_delete_objects(pids, store): - """Test delete_object successfully deletes objects.""" + """Test delete_object successfully deletes objects from /objects.""" test_dir = "tests/testdata/" entity = "objects" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -810,6 +810,58 @@ def test_delete_objects(pids, store): assert store.count(entity) == 0 +def test_delete_objects_pid_refs_file(pids, store): + """Test delete_object deletes the pid refs file containing the cid.""" + test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + _metadata_cid = store.store_metadata(pid, syspath, format_id) + store.delete_object(pid) + pid_refs_file_path = store.get_refs_abs_path("pid", pid) + assert not os.path.exists(pid_refs_file_path) + + +def test_delete_objects_cid_refs_file(pids, 
store): + """Test delete_object deletes the cid refs file containing the cid.""" + test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + object_metadata = store.store_object(pid, path) + cid = object_metadata.id + store.tag_object(pid, cid) + _metadata_cid = store.store_metadata(pid, syspath, format_id) + store.delete_object(pid) + cid_refs_file_path = store.get_refs_abs_path("cid", cid) + assert not os.path.exists(cid_refs_file_path) + + +def test_delete_objects_cid_refs_file_with_pid_refs_remaining(pids, store): + """Test delete_object does not delete the cid refs file that still contains ref.""" + test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + object_metadata = store.store_object(pid, path) + cid = object_metadata.id + store.tag_object(pid, cid) + cid_refs_abs_path = store.get_refs_abs_path("cid", cid) + store.update_cid_refs(cid_refs_abs_path, "dou.test.1") + _metadata_cid = store.store_metadata(pid, syspath, format_id) + store.delete_object(pid) + cid_refs_file_path = store.get_refs_abs_path("cid", cid) + assert os.path.exists(cid_refs_file_path) + + def test_delete_object_pid_empty(store): """Test delete_object raises error when empty pid supplied.""" pid = " " From e5b60aea718fae3f7e9dd938521e5fbc67b56be2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 14:03:05 -0800 Subject: [PATCH 35/71] Add pytests for 'find_object' method --- src/hashstore/filehashstore.py | 1 - tests/test_filehashstore_interface.py | 29 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 
795b1f28..0753572b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -523,7 +523,6 @@ def find_object(self, pid): logging.debug( "FileHashStore - find_object: Request to find object for for pid: %s", pid ) - # TODO: Write tests for this method self._is_string_none_or_empty(pid, "pid", "find_object") pid_ref_abs_path = self.get_refs_abs_path("pid", pid) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index dda3b4c3..60718839 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -533,6 +533,35 @@ def test_store_object_sparse_large_file(store): assert object_metadata_id == object_metadata.hex_digests.get("sha256") +def test_find_object(pids, store): + """Test find object returns the correct content identifier (cid).""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + cid = store.find_object(pid) + assert cid == object_metadata.hex_digests.get("sha256") + + +def test_find_object_pid_object_does_not_exist(store): + """Test find object throws exception when object doesn't exist.""" + with pytest.raises(FileNotFoundError): + store.find_object("dou.test.1") + + +def test_find_object_pid_none(store): + """Test find object throws exception when pid is None.""" + with pytest.raises(ValueError): + store.find_object(None) + + +def test_find_object_pid_empty(store): + """Test find object throws exception when pid is empty.""" + with pytest.raises(ValueError): + store.find_object("") + + def test_store_metadata(pids, store): """Test store metadata.""" test_dir = "tests/testdata/" From 7242a62e22595fa8deca603402711ca8fa8ee9fe Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 14:09:33 -0800 Subject: [PATCH 36/71] Clean up 'filehashstore_interface' pytests --- tests/test_filehashstore_interface.py | 22 
---------------------- 1 file changed, 22 deletions(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 60718839..af957c8d 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -34,13 +34,9 @@ def test_store_object(pids, store): """Test store object.""" test_dir = "tests/testdata/" entity = "objects" - format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) assert object_metadata.id == pids[pid][store.algorithm] assert store.count(entity) == 3 @@ -49,13 +45,9 @@ def test_store_object_files_path(pids, store): """Test store object when given a path.""" test_dir = "tests/testdata/" entity = "objects" - format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) assert store.exists(entity, pids[pid][store.algorithm]) assert store.count(entity) == 3 @@ -64,13 +56,9 @@ def test_store_object_files_string(pids, store): """Test store object when given a string.""" test_dir = "tests/testdata/" entity = "objects" - format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path_string = test_dir + pid.replace("/", "_") - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path_string) - _metadata_cid = store.store_metadata(pid, syspath, format_id) assert store.exists(entity, pids[pid][store.algorithm]) assert store.count(entity) == 3 @@ -567,10 +555,8 @@ def test_store_metadata(pids, 
store): test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, syspath, format_id) assert metadata_cid == pids[pid]["metadata_cid"] @@ -579,10 +565,8 @@ def test_store_metadata_default_format_id(pids, store): """Test store metadata returns expected id when storing with default format_id.""" test_dir = "tests/testdata/" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, syspath) assert metadata_cid == pids[pid]["metadata_cid"] @@ -593,10 +577,8 @@ def test_store_metadata_files_path(pids, store): entity = "metadata" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, syspath, format_id) assert store.exists(entity, metadata_cid) assert metadata_cid == pids[pid]["metadata_cid"] @@ -609,10 +591,8 @@ def test_store_metadata_files_string(pids, store): entity = "metadata" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): - path_string = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) - _object_metadata = store.store_object(pid, path_string) metadata_cid = store.store_metadata(pid, syspath_string, format_id) assert store.exists(entity, metadata_cid) assert store.count(entity) == 3 @@ -624,8 +604,6 @@ def test_store_metadata_files_input_stream(pids, store): entity = "metadata" format_id = 
"http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - _object_metadata = store.store_object(pid, path) filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) syspath_stream = io.open(syspath_string, "rb") From 180d9710c48614f9ae464b8bd213562d56da33a9 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 12 Nov 2023 14:59:44 -0800 Subject: [PATCH 37/71] Add new pytests for 'tag_object' method --- src/hashstore/filehashstore.py | 24 ++++++--- tests/test_filehashstore_interface.py | 75 +++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 8 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 0753572b..5f61a11a 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -478,7 +478,13 @@ def store_object( return object_metadata def tag_object(self, pid, cid): - # TODO: Write tests for this method + logging.debug( + "FileHashStore - tag_object: Tagging object cid: {%s} with pid: {%s}.", + cid, + pid, + ) + self._is_string_none_or_empty(pid, "pid", "tag_object") + self._is_string_none_or_empty(cid, "cid", "tag_object") # Wait for the cid to release if it's being tagged while cid in self.reference_locked_cids: logging.debug( @@ -494,19 +500,21 @@ def tag_object(self, pid, cid): ) self.reference_locked_cids.append(cid) try: - # Check to see if reference file already exists for the cid + # TODO: Review process and test what happens when specific pieces fail + # We cannot have a pid ref file whose pid is not referenced in the cid refs file cid_ref_abs_path = self.get_refs_abs_path("cid", cid) if os.path.exists(cid_ref_abs_path): # If it does, read the file and add the new pid on its own line self.update_cid_refs(cid_ref_abs_path, pid) else: - # If not, create the cid ref file in '.../refs/cid' and write the pid - self.create_path(os.path.dirname(cid_ref_abs_path)) - self.write_cid_refs_file(cid_ref_abs_path, pid) 
- # Then create the pid ref file in '.../refs/pid' with the cid as its content + # If not, create the pid ref file in '.../refs/pid' with the cid as its content pid_ref_abs_path = self.get_refs_abs_path("pid", pid) self.create_path(os.path.dirname(pid_ref_abs_path)) self.write_pid_refs_file(pid_ref_abs_path, cid) + # Then create the cid ref file in '.../refs/cid' and write the pid + self.create_path(os.path.dirname(cid_ref_abs_path)) + self.write_cid_refs_file(cid_ref_abs_path, pid) + return True finally: # Release cid with self.reference_lock: @@ -517,7 +525,6 @@ def tag_object(self, pid, cid): self.reference_locked_cids.remove(cid) info_msg = f"FileHashStore - tag_object: Successfully tagged cid: {cid} with pid: {pid}" logging.info(info_msg) - return def find_object(self, pid): logging.debug( @@ -843,7 +850,7 @@ def store_data_only(self, data): + f" Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) - raise IOError(exception_string) from err + raise err def _move_and_get_checksums( self, @@ -1521,6 +1528,7 @@ def _validate_object( tmp_file_size: Size of the tmp file file_size_to_validate: Expected size of the object """ + # TODO: Refactor this method and/or create a new method for Metacat client to call if file_size_to_validate is not None and file_size_to_validate > 0: if file_size_to_validate != tmp_file_size: self.delete(entity, tmp_file_name) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index af957c8d..40af07f5 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -521,6 +521,81 @@ def test_store_object_sparse_large_file(store): assert object_metadata_id == object_metadata.hex_digests.get("sha256") +def test_tag_object(pids, store): + """Test tag object returns boolean.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + object_tagged = 
store.tag_object(pid, object_metadata.id) + assert object_tagged + + +def test_tag_object_pid_refs_file(pids, store): + """Test tag object creates the pid reference file.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + pid_refs_file_path = store.get_refs_abs_path("pid", pid) + assert os.path.exists(pid_refs_file_path) + + +def test_tag_object_pid_refs_file_content(pids, store): + """Test tag object creates the pid reference file contains the correct cid.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + pid_refs_file_path = store.get_refs_abs_path("pid", pid) + with open(pid_refs_file_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + assert pid_refs_cid == object_metadata.id + + +def test_tag_object_cid_refs_file(pids, store): + """Test tag object creates the cid reference file.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.id + store.tag_object(pid, object_metadata.id) + cid_refs_file_path = store.get_refs_abs_path("cid", cid) + assert os.path.exists(cid_refs_file_path) + + +def test_tag_object_cid_refs_file_content(pids, store): + """Test tag object tags cid reference file successfully with pid.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + cid_refs_file_path = store.get_refs_abs_path("cid", object_metadata.id) + with open(cid_refs_file_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read().strip() + assert pid_refs_cid == pid + + +def 
test_tag_object_with_existing_cid_refs_file(pids, store): + """Test tag object raises exception when trying to add another cid to an + existing pid reference file and that a cid reference file is not created.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + another_cid = "dou.test.1" + with pytest.raises(ValueError): + store.tag_object(pid, another_cid) + + second_cid_hash = store.get_refs_abs_path("cid", another_cid) + assert not os.path.exists(second_cid_hash) + + def test_find_object(pids, store): """Test find object returns the correct content identifier (cid).""" test_dir = "tests/testdata/" From 636eeffdcec49cfdfaa90f715b3de99d221d4358 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 09:34:54 -0800 Subject: [PATCH 38/71] Rename '_mktmpfile' method to '_write_to_tmp_file_and_get_hex_digests' and update pytests --- src/hashstore/filehashstore.py | 10 ++++-- tests/test_filehashstore.py | 58 +++++++++++++++++++--------------- 2 files changed, 41 insertions(+), 27 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 5f61a11a..3d7c26d1 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -894,7 +894,11 @@ def _move_and_get_checksums( + f" file and calculating checksums for pid: {pid}" ) logging.debug(debug_msg) - hex_digests, tmp_file_name, tmp_file_size = self._mktmpfile( + ( + hex_digests, + tmp_file_name, + tmp_file_size, + ) = self._write_to_tmp_file_and_get_hex_digests( stream, additional_algorithm, checksum_algorithm ) logging.debug( @@ -981,7 +985,9 @@ def _move_and_get_checksums( return (object_cid, tmp_file_size, hex_digests) - def _mktmpfile(self, stream, additional_algorithm=None, checksum_algorithm=None): + def _write_to_tmp_file_and_get_hex_digests( + self, stream, additional_algorithm=None, checksum_algorithm=None + ): 
"""Create a named temporary file from a `Stream` object and return its filename and a dictionary of its algorithms and hex digests. If an additionak and/or checksum algorithm is provided, it will add the respective hex digest to the dictionary. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 83cd1e62..6bfd6736 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -409,8 +409,8 @@ def test_move_and_get_checksums_file_size_raises_error(pids, store): input_stream.close() -def test_mktempfile_additional_algo(store): - """Test _mktempfile returns correct hex digests for additional algorithm.""" +def test_write_to_tmp_file_and_get_hex_digests_additional_algo(store): + """Test _write...hex_digests returns correct hex digests for additional algorithm.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -420,15 +420,15 @@ def test_mktempfile_additional_algo(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) # pylint: disable=W0212 - hex_digests, _, _ = store._mktmpfile( + hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, additional_algorithm=checksum_algo ) input_stream.close() assert hex_digests.get("sha3_256") == checksum_correct -def test_mktempfile_checksum_algo(store): - """Test _mktempfile returns correct hex digests for checksum algorithm.""" +def test_write_to_tmp_file_and_get_hex_digests_checksum_algo(store): + """Test _write...hex_digests returns correct hex digests for checksum algorithm.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -438,13 +438,15 @@ def test_mktempfile_checksum_algo(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) # pylint: disable=W0212 - hex_digests, _, _ = store._mktmpfile(input_stream, checksum_algorithm=checksum_algo) + hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests( + input_stream, checksum_algorithm=checksum_algo + ) 
input_stream.close() assert hex_digests.get("sha3_256") == checksum_correct -def test_mktempfile_checksum_and_additional_algo(store): - """Test _mktempfile returns correct hex digests for checksum algorithm.""" +def test_write_to_tmp_file_and_get_hex_digests_checksum_and_additional_algo(store): + """Test _write...hex_digests returns correct hex digests for checksum algorithm.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -458,7 +460,7 @@ def test_mktempfile_checksum_and_additional_algo(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) # pylint: disable=W0212 - hex_digests, _, _ = store._mktmpfile( + hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, additional_algorithm=additional_algo, checksum_algorithm=checksum_algo, @@ -468,8 +470,10 @@ def test_mktempfile_checksum_and_additional_algo(store): assert hex_digests.get("sha224") == additional_algo_checksum -def test_mktempfile_checksum_and_additional_algo_duplicate(store): - """Test _mktempfile succeeds with duplicate algorithms (de-duplicates).""" +def test_write_to_tmp_file_and_get_hex_digests_checksum_and_additional_algo_duplicate( + store, +): + """Test _write...hex_digests succeeds with duplicate algorithms (de-duplicates).""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -478,7 +482,7 @@ def test_mktempfile_checksum_and_additional_algo_duplicate(store): checksum_algo = "sha224" checksum_correct = "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1" # pylint: disable=W0212 - hex_digests, _, _ = store._mktmpfile( + hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, additional_algorithm=additional_algo, checksum_algorithm=checksum_algo, @@ -487,26 +491,26 @@ def test_mktempfile_checksum_and_additional_algo_duplicate(store): assert hex_digests.get("sha224") == checksum_correct -def test_mktempfile_file_size(pids, store): - """Test _mktempfile returns correct file 
size.""" +def test_write_to_tmp_file_and_get_hex_digests_file_size(pids, store): + """Test _write...hex_digests returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") # pylint: disable=W0212 - _, _, tmp_file_size = store._mktmpfile(input_stream) + _, _, tmp_file_size = store._write_to_tmp_file_and_get_hex_digests(input_stream) input_stream.close() assert tmp_file_size == pids[pid]["file_size_bytes"] -def test_mktempfile_hex_digests(pids, store): - """Test _mktempfile returns correct hex digests.""" +def test_write_to_tmp_file_and_get_hex_digests_hex_digests(pids, store): + """Test _write...hex_digests returns correct hex digests.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") # pylint: disable=W0212 - hex_digests, _, _ = store._mktmpfile(input_stream) + hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests(input_stream) input_stream.close() assert hex_digests.get("md5") == pids[pid]["md5"] assert hex_digests.get("sha1") == pids[pid]["sha1"] @@ -515,20 +519,20 @@ def test_mktempfile_hex_digests(pids, store): assert hex_digests.get("sha512") == pids[pid]["sha512"] -def test_mktempfile_tmpfile_object(pids, store): - """Test _mktempfile creates file successfully.""" +def test_write_to_tmp_file_and_get_hex_digests_tmpfile_object(pids, store): + """Test _write...hex_digests creates file successfully.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") # pylint: disable=W0212 - _, tmp_file_name, _ = store._mktmpfile(input_stream) + _, tmp_file_name, _ = store._write_to_tmp_file_and_get_hex_digests(input_stream) input_stream.close() assert os.path.isfile(tmp_file_name) is True -def test_mktempfile_with_unsupported_algorithm(pids, store): - """Test _mktempfile raises error when bad algorithm 
supplied.""" +def test_write_to_tmp_file_and_get_hex_digests_with_unsupported_algorithm(pids, store): + """Test _write...hex_digests raises error when bad algorithm supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -536,10 +540,14 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): algo = "md2" with pytest.raises(ValueError): # pylint: disable=W0212 - _, _, _ = store._mktmpfile(input_stream, additional_algorithm=algo) + _, _, _ = store._write_to_tmp_file_and_get_hex_digests( + input_stream, additional_algorithm=algo + ) with pytest.raises(ValueError): # pylint: disable=W0212 - _, _, _ = store._mktmpfile(input_stream, checksum_algorithm=algo) + _, _, _ = store._write_to_tmp_file_and_get_hex_digests( + input_stream, checksum_algorithm=algo + ) input_stream.close() From ee790ebfe91f59f5814b0552c09abe15ed67e7c5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 09:43:31 -0800 Subject: [PATCH 39/71] Extract new method '_mktmpfile' from '_write_to_tmp_file_and_get_hex_digests' and add new pytest --- src/hashstore/filehashstore.py | 64 ++++++++++++++++++++++------------ tests/test_filehashstore.py | 9 +++++ 2 files changed, 50 insertions(+), 23 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3d7c26d1..6109c4eb 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1011,26 +1011,11 @@ def _write_to_tmp_file_and_get_hex_digests( # Physically create directory if it doesn't exist if os.path.exists(tmp_root_path) is False: self.create_path(tmp_root_path) - tmp = NamedTemporaryFile(dir=tmp_root_path, delete=False) - - # Delete tmp file if python interpreter crashes or thread is interrupted - # when store_object is called - def delete_tmp_file(): - if os.path.exists(tmp.name): - os.remove(tmp.name) - - atexit.register(delete_tmp_file) - - # Ensure tmp file is created with desired permissions - if self.fmode is not None: - 
oldmask = os.umask(0) - try: - os.chmod(tmp.name, self.fmode) - finally: - os.umask(oldmask) + tmp = self._mktmpfile(tmp_root_path) logging.debug( - "FileHashStore - _mktempfile: tmp file created: %s, calculating hex digests.", + "FileHashStore - _write_to_tmp_file_and_get_hex_digests: tmp file created:" + + " %s, calculating hex digests.", tmp.name, ) @@ -1047,7 +1032,8 @@ def delete_tmp_file(): for hash_algorithm in hash_algorithms: hash_algorithm.update(self._to_bytes(data)) logging.debug( - "FileHashStore - _mktempfile: Object stream successfully written to tmp file: %s", + "FileHashStore - _write_to_tmp_file_and_get_hex_digests: Object stream" + + " successfully written to tmp file: %s", tmp.name, ) @@ -1059,19 +1045,23 @@ def delete_tmp_file(): # Ready for validation and atomic move tmp_file_completion_flag = True - logging.debug("FileHashStore - _mktempfile: Hex digests calculated.") + logging.debug( + "FileHashStore - _write_to_tmp_file_and_get_hex_digests: Hex digests calculated." + ) return hex_digest_dict, tmp.name, tmp_file_size # pylint: disable=W0718 except Exception as err: exception_string = ( - f"FileHashStore - _mktempfile: Unexpected {err=}, {type(err)=}" + "FileHashStore - _write_to_tmp_file_and_get_hex_digests:" + + f" Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) # pylint: disable=W0707,W0719 raise Exception(exception_string) except KeyboardInterrupt: exception_string = ( - "FileHashStore - _mktempfile: Keyboard interruption by user." + "FileHashStore - _write_to_tmp_file_and_get_hex_digests:" + + " Keyboard interruption by user." 
) logging.error(exception_string) if os.path.exists(tmp.name): @@ -1084,11 +1074,39 @@ def delete_tmp_file(): # pylint: disable=W0718 except Exception as err: exception_string = ( - f"FileHashStore - _mktempfile: Unexpected {err=} while attempting to" + "FileHashStore - _write_to_tmp_file_and_get_hex_digests:" + + f"Unexpected {err=} while attempting to" + f" delete tmp file: {tmp.name}, {type(err)=}" ) logging.error(exception_string) + def _mktmpfile(self, path): + """Create a temporary file at the given path ready to be written. + + Args: + path (string): Path to the file location + + Returns: + tmp (file object): object with file-like interface + """ + tmp = NamedTemporaryFile(dir=path, delete=False) + + # Delete tmp file if python interpreter crashes or thread is interrupted + def delete_tmp_file(): + if os.path.exists(tmp.name): + os.remove(tmp.name) + + atexit.register(delete_tmp_file) + + # Ensure tmp file is created with desired permissions + if self.fmode is not None: + oldmask = os.umask(0) + try: + os.chmod(tmp.name, self.fmode) + finally: + os.umask(oldmask) + return tmp + def write_cid_refs_file(self, cid_ref_abs_path, pid): """Write the reference file for the given content identifier (cid). A reference file contains every pid that references a cid each on its own line. 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 6bfd6736..1402a24d 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -551,6 +551,15 @@ def test_write_to_tmp_file_and_get_hex_digests_with_unsupported_algorithm(pids, input_stream.close() +def test_mktmpfile(store): + """Test that _mktmpfile creates and returns a tmp file.""" + path = store.root + "/doutest/tmp/" + store.create_path(path) + # pylint: disable=W0212 + tmp = store._mktmpfile(path) + assert os.path.exists(tmp.name) + + def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): From 524947c169e96f006421ff38f38f3ff770ae2b8f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 09:44:44 -0800 Subject: [PATCH 40/71] Refactor '_mktmpmetadata' method --- src/hashstore/filehashstore.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 6109c4eb..14ecc46c 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1411,14 +1411,7 @@ def _mktmpmetadata(self, stream): if os.path.exists(tmp_root_path) is False: self.create_path(tmp_root_path) - tmp = NamedTemporaryFile(dir=tmp_root_path, delete=False) - # Ensure tmp file is created with desired permissions - if self.fmode is not None: - oldmask = os.umask(0) - try: - os.chmod(tmp.name, self.fmode) - finally: - os.umask(oldmask) + tmp = self._mktmpfile(tmp_root_path) # tmp is a file-like object that is already opened for writing by default logging.debug( From d77a10e52f0780cbe3b078d00925ed998aad19a5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 11:58:28 -0800 Subject: [PATCH 41/71] Refactor 'write_pid_refs_file' to throw exception immediately if refs file for given pid exists. 
--- src/hashstore/filehashstore.py | 18 +++++++----------- tests/test_filehashstore.py | 23 +++-------------------- tests/test_filehashstore_interface.py | 2 +- 3 files changed, 11 insertions(+), 32 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 14ecc46c..f6e65168 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -516,6 +516,7 @@ def tag_object(self, pid, cid): self.write_cid_refs_file(cid_ref_abs_path, pid) return True finally: + # TODO: Verify that the reference files have been written as expected. # Release cid with self.reference_lock: logging.debug( @@ -1277,17 +1278,12 @@ def write_pid_refs_file(self, pid_ref_abs_path, cid): logging.info(info_msg) if os.path.exists(pid_ref_abs_path): - with open(pid_ref_abs_path, "r", encoding="utf8") as f: - pid_refs_cid = f.read() - if pid_refs_cid == cid: - return - else: - exception_string = ( - "FileHashStore - write_pid_refs_file: pid reference file exists but" - + f" cid ({cid}) is different from cid stored ({pid_refs_cid})." 
- ) - logging.error(exception_string) - raise ValueError(exception_string) + exception_string = ( + "FileHashStore - write_pid_refs_file: pid ref file already exists for %s", + pid_ref_abs_path, + ) + logging.error(exception_string) + raise FileExistsError(exception_string) else: try: with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 1402a24d..7eeecaa4 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -717,32 +717,15 @@ def test_write_pid_refs_file_content(pids, store): def test_write_pid_refs_file_exists(pids, store): - """Test that write_pid_refs_file returns when ref already exists and the - cid given is the same.""" + """Test that write_pid_refs_file throws exception if ref file already exists.""" for pid in pids.keys(): cid = pids[pid]["sha256"] pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store.write_pid_refs_file(pid_ref_abs_path, cid) # This should not write and return - store.write_pid_refs_file(pid_ref_abs_path, cid) - - with open(pid_ref_abs_path, "r", encoding="utf8") as f: - pid_refs_cid = f.read() - - assert cid == pid_refs_cid - - -def test_write_pid_refs_file_exists_different_cid(pids, store): - """Test that write_pid_refs_file returns when ref already exists and the - cid given is the same.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - store.write_pid_refs_file(pid_ref_abs_path, cid) - with pytest.raises(ValueError): - store.write_pid_refs_file(pid_ref_abs_path, "abc123") + with pytest.raises(FileExistsError): + store.write_pid_refs_file(pid_ref_abs_path, cid) def test_delete_pid_refs_file(pids, store): diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 40af07f5..94097d3a 100644 --- 
a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -589,7 +589,7 @@ def test_tag_object_with_existing_cid_refs_file(pids, store): object_metadata = store.store_object(pid, path) store.tag_object(pid, object_metadata.id) another_cid = "dou.test.1" - with pytest.raises(ValueError): + with pytest.raises(FileExistsError): store.tag_object(pid, another_cid) second_cid_hash = store.get_refs_abs_path("cid", another_cid) From 562cf79195d0411f5e6ae37eca1642be1c79eb77 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 13:08:34 -0800 Subject: [PATCH 42/71] Refactor 'tag_object' process and related methods and fix bug in 'update_cid_refs()' --- src/hashstore/filehashstore.py | 73 +++++++++++++++++----------------- 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index f6e65168..d280e594 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -502,13 +502,20 @@ def tag_object(self, pid, cid): try: # TODO: Review process and test what happens when specific pieces fail # We cannot have a pid ref file whose pid is not referenced in the cid refs file + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) cid_ref_abs_path = self.get_refs_abs_path("cid", cid) - if os.path.exists(cid_ref_abs_path): + if os.path.exists(pid_ref_abs_path): + exception_string = ( + "FileHashStore - write_pid_refs_file: pid ref file already exists for %s", + pid_ref_abs_path, + ) + logging.error(exception_string) + raise FileExistsError(exception_string) + elif os.path.exists(cid_ref_abs_path): # If it does, read the file and add the new pid on its own line self.update_cid_refs(cid_ref_abs_path, pid) else: # If not, create the pid ref file in '.../refs/pid' with the cid as its content - pid_ref_abs_path = self.get_refs_abs_path("pid", pid) self.create_path(os.path.dirname(pid_ref_abs_path)) self.write_pid_refs_file(pid_ref_abs_path, cid) # Then create the 
cid ref file in '.../refs/cid' and write the pid @@ -1153,19 +1160,19 @@ def update_cid_refs(self, cid_ref_abs_path, pid): logging.info(info_msg) try: + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + if pid == value: + err_msg = ( + f"FileHashStore - update_cid_refs: pid ({pid}) already referenced in" + + f" cid reference file: {cid_ref_abs_path} " + ) + raise ValueError(err_msg) + with open(cid_ref_abs_path, "a+", encoding="utf8") as cid_ref_file: fcntl.flock(cid_ref_file, fcntl.LOCK_EX) - # Read the ref file to see if the pid is already referencing the cid - cid_ref_file_content = cid_ref_file.read() - - if pid in cid_ref_file_content: - err_msg = ( - f"FileHashStore - update_cid_refs: pid ({pid}) already reference in" - + f" cid reference file: {cid_ref_abs_path} " - ) - raise ValueError(err_msg) - else: - cid_ref_file.write(pid + "\n") + cid_ref_file.write(pid + "\n") # The context manager will take care of releasing the lock # But the code to explicitly release the lock if desired is below # fcntl.flock(f, fcntl.LOCK_UN) @@ -1262,45 +1269,37 @@ def delete_cid_refs_file(self, cid_ref_abs_path): logging.error(exception_string) raise err - def write_pid_refs_file(self, pid_ref_abs_path, cid): + def write_pid_refs_file(self, path, cid): """Write the reference file for the given pid (persistent identifier). A reference file for a pid contains the cid that it references. Its permanent address is the pid hash with HashStore's default store algorithm and follows its directory structure. 
Args: - pid_ref_abs_path (string): Absolute path to the pid ref file + path (string): Path to file to be written into cid (string): Content identifier """ info_msg = ( f"FileHashStore - write_pid_refs_file: Writing cid ({cid}) into pid reference" - + f" file: {pid_ref_abs_path}" + + f" file: {path}" ) logging.info(info_msg) - if os.path.exists(pid_ref_abs_path): + try: + with open(path, "w", encoding="utf8") as pid_ref_file: + fcntl.flock(pid_ref_file, fcntl.LOCK_EX) + pid_ref_file.write(cid) + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) + return + + except Exception as err: exception_string = ( - "FileHashStore - write_pid_refs_file: pid ref file already exists for %s", - pid_ref_abs_path, + "FileHashStore - write_pid_refs_file: failed to write pid reference file:" + + f" {path} for cid: {cid}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) - raise FileExistsError(exception_string) - else: - try: - with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: - fcntl.flock(pid_ref_file, fcntl.LOCK_EX) - pid_ref_file.write(cid) - # The context manager will take care of releasing the lock - # But the code to explicitly release the lock if desired is below - # fcntl.flock(f, fcntl.LOCK_UN) - return - - except Exception as err: - exception_string = ( - "FileHashStore - write_pid_refs_file: failed to write pid reference file:" - + f" {pid_ref_abs_path} for cid: {cid}. Unexpected {err=}, {type(err)=}" - ) - logging.error(exception_string) - raise err + raise err def delete_pid_refs_file(self, pid_ref_abs_path): """Delete a pid reference file. 
From 59df239e9144b10455b5660b158c898787aa2fa2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 13:18:35 -0800 Subject: [PATCH 43/71] Revise pytests, and extract pytests for references related processes into its own test module 'test_filehashstore_references' --- tests/test_filehashstore.py | 188 ------------------------ tests/test_filehashstore_interface.py | 18 ++- tests/test_filehashstore_references.py | 190 +++++++++++++++++++++++++ 3 files changed, 207 insertions(+), 189 deletions(-) create mode 100644 tests/test_filehashstore_references.py diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 7eeecaa4..1b0116fa 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -560,194 +560,6 @@ def test_mktmpfile(store): assert os.path.exists(tmp.name) -def test_write_cid_refs_file(pids, store): - """Test that write_cid_reference writes a reference file.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - assert os.path.exists(cid_ref_abs_path) - - -def test_write_cid_refs_file_content(pids, store): - """Test that write_cid_ref_file writes the expected content.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - - with open(cid_ref_abs_path, "r", encoding="utf8") as f: - cid_ref_file_pid = f.read() - - assert pid == cid_ref_file_pid.replace("\n", "") - - -def test_update_cid_refs_content(pids, store): - """Test that update_cid_ref updates the ref file as expected.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) 
- - pid_other = "dou.test.1" - store.update_cid_refs(cid_ref_abs_path, pid_other) - - with open(cid_ref_abs_path, "r", encoding="utf8") as f: - for _, line in enumerate(f, start=1): - value = line.strip() - assert value == pid or value == pid_other - - -def test_update_cid_refs_content_multiple(pids, store): - """Test that update_cid_refs adds multiple references successfully.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - - cid_reference_list = [pid] - for i in range(0, 5): - store.update_cid_refs(cid_ref_abs_path, f"dou.test.{i}") - cid_reference_list.append(f"dou.test.{i}") - - line_count = 0 - with open(cid_ref_abs_path, "r", encoding="utf8") as f: - for _, line in enumerate(f, start=1): - line_count += 1 - value = line.strip() - assert value in cid_reference_list - - assert line_count == 6 - - -def test_delete_cid_refs_pid(pids, store): - """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - - pid_other = "dou.test.1" - store.update_cid_refs(cid_ref_abs_path, pid_other) - store.delete_cid_refs_pid(cid_ref_abs_path, pid) - - with open(cid_ref_abs_path, "r", encoding="utf8") as f: - for _, line in enumerate(f, start=1): - value = line.strip() - print(value) - assert value == pid_other - - -def test_delete_cid_refs_pid_pid_not_found(pids, store): - """Test that delete_cid_refs_pid raises exception when pid not found.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - - pid_other = 
"dou.test.1" - store.update_cid_refs(cid_ref_abs_path, pid_other) - with pytest.raises(ValueError): - store.delete_cid_refs_pid(cid_ref_abs_path, "dou.not.found.1") - - -def test_delete_cid_refs_pid_file(pids, store): - """Test that delete_cid_refs_file deletes a reference file.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - store.delete_cid_refs_pid(cid_ref_abs_path, pid) - cid_refs_deleted = store.delete_cid_refs_file(cid_ref_abs_path) - - assert cid_refs_deleted - assert not os.path.exists(cid_ref_abs_path) - - -def test_delete_cid_refs_pid_file_not_empty(pids, store): - """Test that delete_cid_refs_file does not raise an exception when refs file - is not empty.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - cid_refs_deleted = store.delete_cid_refs_file(cid_ref_abs_path) - assert not cid_refs_deleted - - -def test_delete_cid_refs_pid_file_not_found(pids, store): - """Test that delete_cid_refs_file raises an exception when refs file not found.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - cid_ref_abs_path = store.get_refs_abs_path("cid", cid) - with pytest.raises(FileNotFoundError): - store.delete_cid_refs_file(cid_ref_abs_path) - - -def test_write_pid_refs_file(pids, store): - """Test that write_pid_refs_file writes a reference file.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - store.write_pid_refs_file(pid_ref_abs_path, cid) - assert os.path.exists(pid_ref_abs_path) - - -def test_write_pid_refs_file_content(pids, store): - """Test that write_pid_refs_file writes the expected 
content.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - store.write_pid_refs_file(pid_ref_abs_path, cid) - - with open(pid_ref_abs_path, "r", encoding="utf8") as f: - pid_refs_cid = f.read() - - assert cid == pid_refs_cid - - -def test_write_pid_refs_file_exists(pids, store): - """Test that write_pid_refs_file throws exception if ref file already exists.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - store.write_pid_refs_file(pid_ref_abs_path, cid) - # This should not write and return - with pytest.raises(FileExistsError): - store.write_pid_refs_file(pid_ref_abs_path, cid) - - -def test_delete_pid_refs_file(pids, store): - """Test that delete_pid_refs_file deletes a reference file.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - store.write_pid_refs_file(pid_ref_abs_path, cid) - store.delete_pid_refs_file(pid_ref_abs_path) - - assert not os.path.exists(pid_ref_abs_path) - - -def test_delete_pid_refs_file_file_not_found(pids, store): - """Test that delete_pid_refs_file raises an exception when refs file not found.""" - for pid in pids.keys(): - pid_ref_abs_path = store.get_refs_abs_path("pid", pid) - with pytest.raises(FileNotFoundError): - store.delete_cid_refs_file(pid_ref_abs_path) - - def test_put_metadata_with_path(pids, store): """Test put_metadata with path object.""" entity = "metadata" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 94097d3a..e7ead830 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -542,6 +542,22 @@ def test_tag_object_pid_refs_file(pids, store): assert 
os.path.exists(pid_refs_file_path) +def test_tag_object_pid_refs_file_exists(pids, store): + """Test tag object throws exception when pid refs file already exists.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.id + store.tag_object(pid, cid) + pid_refs_file_path = store.get_refs_abs_path("pid", pid) + assert os.path.exists(pid_refs_file_path) + cid_refs_file_path = store.get_refs_abs_path("cid", cid) + assert os.path.exists(cid_refs_file_path) + with pytest.raises(FileExistsError): + store.tag_object(pid, cid) + + def test_tag_object_pid_refs_file_content(pids, store): """Test tag object creates the pid reference file contains the correct cid.""" test_dir = "tests/testdata/" @@ -580,7 +596,7 @@ def test_tag_object_cid_refs_file_content(pids, store): assert pid_refs_cid == pid -def test_tag_object_with_existing_cid_refs_file(pids, store): +def test_tag_object_cid_refs_file_exists(pids, store): """Test tag object raises exception when trying to add another cid to an existing pid reference file and that a cid reference file is not created.""" test_dir = "tests/testdata/" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py new file mode 100644 index 00000000..63fc54da --- /dev/null +++ b/tests/test_filehashstore_references.py @@ -0,0 +1,190 @@ +"""Test module for FileHashStore core, utility and supporting methods""" +import os +import pytest + + +def test_write_cid_refs_file(pids, store): + """Test that write_cid_reference writes a reference file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + assert os.path.exists(cid_ref_abs_path) + + +def test_write_cid_refs_file_content(pids, store): + """Test that write_cid_ref_file writes 
the expected content.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + cid_ref_file_pid = f.read() + + assert pid == cid_ref_file_pid.strip() + + +def test_update_cid_refs_content(pids, store): + """Test that update_cid_ref updates the ref file as expected.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + + pid_other = "dou.test.1" + store.update_cid_refs(cid_ref_abs_path, pid_other) + + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + assert value == pid or value == pid_other + + +def test_update_cid_refs_content_multiple(pids, store): + """Test that update_cid_refs adds multiple references successfully.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + + cid_reference_list = [pid] + for i in range(0, 5): + store.update_cid_refs(cid_ref_abs_path, f"dou.test.{i}") + cid_reference_list.append(f"dou.test.{i}") + + line_count = 0 + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + line_count += 1 + value = line.strip() + assert value in cid_reference_list + + assert line_count == 6 + + +def test_update_cid_refs_content_pid_exists(pids, store): + """Test that update_cid_ref does not write pid if pid already exists""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + 
store.write_cid_refs_file(cid_ref_abs_path, pid) + with pytest.raises(ValueError): + store.update_cid_refs(cid_ref_abs_path, pid) + + +def test_delete_cid_refs_pid(pids, store): + """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + + pid_other = "dou.test.1" + store.update_cid_refs(cid_ref_abs_path, pid_other) + store.delete_cid_refs_pid(cid_ref_abs_path, pid) + + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + print(value) + assert value == pid_other + + +def test_delete_cid_refs_pid_pid_not_found(pids, store): + """Test that delete_cid_refs_pid raises exception when pid not found.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + + pid_other = "dou.test.1" + store.update_cid_refs(cid_ref_abs_path, pid_other) + with pytest.raises(ValueError): + store.delete_cid_refs_pid(cid_ref_abs_path, "dou.not.found.1") + + +def test_delete_cid_refs_pid_file(pids, store): + """Test that delete_cid_refs_file deletes a reference file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + store.delete_cid_refs_pid(cid_ref_abs_path, pid) + cid_refs_deleted = store.delete_cid_refs_file(cid_ref_abs_path) + + assert cid_refs_deleted + assert not os.path.exists(cid_ref_abs_path) + + +def test_delete_cid_refs_pid_file_not_empty(pids, store): + """Test that delete_cid_refs_file does not raise an exception when refs file + is not empty.""" + 
for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store.write_cid_refs_file(cid_ref_abs_path, pid) + cid_refs_deleted = store.delete_cid_refs_file(cid_ref_abs_path) + assert not cid_refs_deleted + + +def test_delete_cid_refs_pid_file_not_found(pids, store): + """Test that delete_cid_refs_file raises an exception when refs file not found.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + with pytest.raises(FileNotFoundError): + store.delete_cid_refs_file(cid_ref_abs_path) + + +def test_write_pid_refs_file(pids, store): + """Test that write_pid_refs_file writes a reference file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store.write_pid_refs_file(pid_ref_abs_path, cid) + assert os.path.exists(pid_ref_abs_path) + + +def test_write_pid_refs_file_content(pids, store): + """Test that write_pid_refs_file writes the expected content.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store.write_pid_refs_file(pid_ref_abs_path, cid) + + with open(pid_ref_abs_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + + assert cid == pid_refs_cid + + +def test_delete_pid_refs_file(pids, store): + """Test that delete_pid_refs_file deletes a reference file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store.write_pid_refs_file(pid_ref_abs_path, cid) + store.delete_pid_refs_file(pid_ref_abs_path) + + assert not os.path.exists(pid_ref_abs_path) + + +def test_delete_pid_refs_file_file_not_found(pids, store): + """Test that 
delete_pid_refs_file raises an exception when refs file not found.""" + for pid in pids.keys(): + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + with pytest.raises(FileNotFoundError): + store.delete_cid_refs_file(pid_ref_abs_path) From ebfe61098533b5383c08639c7ff049f80252aea3 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 14:10:26 -0800 Subject: [PATCH 44/71] Refactor 'tag_object' process to be atomic and clean up code --- src/hashstore/filehashstore.py | 87 +++++++++++++++----------- tests/test_filehashstore.py | 17 ++--- tests/test_filehashstore_interface.py | 3 +- tests/test_filehashstore_references.py | 52 +++++++-------- 4 files changed, 89 insertions(+), 70 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index d280e594..e8e265c9 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -515,12 +515,29 @@ def tag_object(self, pid, cid): # If it does, read the file and add the new pid on its own line self.update_cid_refs(cid_ref_abs_path, pid) else: - # If not, create the pid ref file in '.../refs/pid' with the cid as its content + # All ref files begin as tmp files and get moved sequentially at once + # Ensure refs tmp folder exists + tmp_root_path = self.get_store_path("refs") / "tmp" + # Physically create directory if it doesn't exist + if os.path.exists(tmp_root_path) is False: + self.create_path(tmp_root_path) + + # Then write pid_refs_file content into tmp file + pid_tmp_file = self._mktmpfile(tmp_root_path) + pid_tmp_file_path = pid_tmp_file.name + self._write_pid_refs_file(pid_tmp_file_path, cid) + # Then write cid_refs_file content into tmp file + cid_tmp_file = self._mktmpfile(tmp_root_path) + cid_tmp_file_path = cid_tmp_file.name + self._write_cid_refs_file(cid_tmp_file_path, pid) + + # Create path for pid ref file in '.../refs/pid' self.create_path(os.path.dirname(pid_ref_abs_path)) - self.write_pid_refs_file(pid_ref_abs_path, cid) - # Then create the cid 
ref file in '.../refs/cid' and write the pid + # Create path for cid ref file in '.../refs/cid' self.create_path(os.path.dirname(cid_ref_abs_path)) - self.write_cid_refs_file(cid_ref_abs_path, pid) + # Move both files + shutil.move(pid_tmp_file_path, pid_ref_abs_path) + shutil.move(cid_tmp_file_path, cid_ref_abs_path) return True finally: # TODO: Verify that the reference files have been written as expected. @@ -676,13 +693,13 @@ def delete_object(self, pid): try: # Remove pid from cid reference file cid_ref_abs_path = self.get_refs_abs_path("cid", cid) - self.delete_cid_refs_pid(cid_ref_abs_path, pid) + self._delete_cid_refs_pid(cid_ref_abs_path, pid) # Delete cid reference file # If the file is not empty, it will not be deleted. - cid_refs_deleted = self.delete_cid_refs_file(cid_ref_abs_path) + cid_refs_deleted = self._delete_cid_refs_file(cid_ref_abs_path) # Delete pid reference file pid_ref_abs_path = self.get_refs_abs_path("pid", pid) - self.delete_pid_refs_file(pid_ref_abs_path) + self._delete_pid_refs_file(pid_ref_abs_path) # Finally, delete the object if cid_refs_deleted: entity = "objects" @@ -1115,22 +1132,20 @@ def delete_tmp_file(): os.umask(oldmask) return tmp - def write_cid_refs_file(self, cid_ref_abs_path, pid): - """Write the reference file for the given content identifier (cid). A reference - file contains every pid that references a cid each on its own line. + def _write_cid_refs_file(self, path, pid): + """Write the reference file in the supplied path for the given content + identifier (cid). A reference file contains every pid that references a + cid each on its own line. 
Args: - cid_ref_abs_path (string): Absolute path to the cid ref file + path (string): Path of file to be written into pid (string): Authority-based or persistent identifier of object """ - info_msg = ( - f"FileHashStore - write_cid_refs_file: Writing pid ({pid}) into cid reference" - + f" file: {cid_ref_abs_path}" - ) + info_msg = f"FileHashStore - write_cid_refs_file: Writing pid ({pid}) into file: {path}" logging.info(info_msg) try: - with open(cid_ref_abs_path, "w", encoding="utf8") as cid_ref_file: + with open(path, "w", encoding="utf8") as cid_ref_file: fcntl.flock(cid_ref_file, fcntl.LOCK_EX) cid_ref_file.write(pid + "\n") # The context manager will take care of releasing the lock @@ -1140,13 +1155,13 @@ def write_cid_refs_file(self, cid_ref_abs_path, pid): except Exception as err: exception_string = ( - "FileHashStore - write_cid_refs_file: failed to write reference for cid:" - + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" + f"FileHashStore - write_cid_refs_file: failed to write pid ({pid})" + + f" into path: {path}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise err - def update_cid_refs(self, cid_ref_abs_path, pid): + def _update_cid_refs(self, cid_ref_abs_path, pid): """Update an existing cid reference file with the given pid. Args: @@ -1186,7 +1201,7 @@ def update_cid_refs(self, cid_ref_abs_path, pid): logging.error(exception_string) raise err - def delete_cid_refs_pid(self, cid_ref_abs_path, pid): + def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): """Delete a pid from a cid reference file. Args: @@ -1228,7 +1243,7 @@ def delete_cid_refs_pid(self, cid_ref_abs_path, pid): logging.error(exception_string) raise err - def delete_cid_refs_file(self, cid_ref_abs_path): + def _delete_cid_refs_file(self, cid_ref_abs_path): """Delete a cid reference file. There must be no references remaining. 
Args: @@ -1269,19 +1284,17 @@ def delete_cid_refs_file(self, cid_ref_abs_path): logging.error(exception_string) raise err - def write_pid_refs_file(self, path, cid): - """Write the reference file for the given pid (persistent identifier). A reference - file for a pid contains the cid that it references. Its permanent address is the pid - hash with HashStore's default store algorithm and follows its directory structure. + def _write_pid_refs_file(self, path, cid): + """Write the reference file in the supplied path for the given pid (persistent + identifier). A reference file for a pid contains the cid that it references. + Its permanent address is the pid hash using HashStore's default store algorithm + and follows its directory structure. Args: - path (string): Path to file to be written into + path (string): Path of file to be written into cid (string): Content identifier """ - info_msg = ( - f"FileHashStore - write_pid_refs_file: Writing cid ({cid}) into pid reference" - + f" file: {path}" - ) + info_msg = f"FileHashStore - write_pid_refs_file: Writing cid ({cid}) into file: {path}" logging.info(info_msg) try: @@ -1295,13 +1308,13 @@ def write_pid_refs_file(self, path, cid): except Exception as err: exception_string = ( - "FileHashStore - write_pid_refs_file: failed to write pid reference file:" - + f" {path} for cid: {cid}. Unexpected {err=}, {type(err)=}" + f"FileHashStore - write_pid_refs_file: failed to write cid ({cid})" + + f" into path: {path}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise err - def delete_pid_refs_file(self, pid_ref_abs_path): + def _delete_pid_refs_file(self, pid_ref_abs_path): """Delete a pid reference file. 
Args: @@ -1498,7 +1511,7 @@ def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): self.clean_algorithm(checksum_algorithm) if checksum_algorithm in self.other_algo_list: debug_additional_other_algo_str = ( - f"FileHashStore - _mktempfile: checksum algorithm: {checksum_algorithm}" + f"FileHashStore - _refine_algorithm_list: checksum algo: {checksum_algorithm}" + " found in other_algo_lists, adding to list of algorithms to calculate." ) logging.debug(debug_additional_other_algo_str) @@ -1507,7 +1520,7 @@ def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): self.clean_algorithm(additional_algorithm) if additional_algorithm in self.other_algo_list: debug_additional_other_algo_str = ( - f"FileHashStore - _mktempfile: additional algorithm: {additional_algorithm}" + f"FileHashStore - _refine_algorithm_list: addit algo: {additional_algorithm}" + " found in other_algo_lists, adding to list of algorithms to calculate." ) logging.debug(debug_additional_other_algo_str) @@ -1764,9 +1777,9 @@ def delete(self, entity, file): except OSError: pass else: - self.remove_empty(os.path.dirname(realpath)) + self._remove_empty(os.path.dirname(realpath)) - def remove_empty(self, subpath): + def _remove_empty(self, subpath): """Successively remove all empty folders starting with `subpath` and proceeding "up" through directory tree until reaching the `root` folder. 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 1b0116fa..801257b4 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -738,9 +738,10 @@ def test_remove_empty_removes_empty_folders_string(store): assert os.path.exists(os.path.join(store.root, three_dirs)) assert os.path.exists(os.path.join(store.root, two_dirs)) assert os.path.exists(os.path.join(store.root, one_dir)) - store.remove_empty(os.path.join(store.root, three_dirs)) - store.remove_empty(os.path.join(store.root, two_dirs)) - store.remove_empty(os.path.join(store.root, one_dir)) + # pylint: disable=W0212 + store._remove_empty(os.path.join(store.root, three_dirs)) + store._remove_empty(os.path.join(store.root, two_dirs)) + store._remove_empty(os.path.join(store.root, one_dir)) assert not os.path.exists(os.path.join(store.root, three_dirs)) assert not os.path.exists(os.path.join(store.root, two_dirs)) assert not os.path.exists(os.path.join(store.root, one_dir)) @@ -757,9 +758,10 @@ def test_remove_empty_removes_empty_folders_path(store): assert (store.root / three_dirs).exists() assert (store.root / two_dirs).exists() assert (store.root / one_dir).exists() - store.remove_empty(store.root / three_dirs) - store.remove_empty(store.root / two_dirs) - store.remove_empty(store.root / one_dir) + # pylint: disable=W0212 + store._remove_empty(store.root / three_dirs) + store._remove_empty(store.root / two_dirs) + store._remove_empty(store.root / one_dir) assert not (store.root / three_dirs).exists() assert not (store.root / two_dirs).exists() assert not (store.root / one_dir).exists() @@ -776,7 +778,8 @@ def test_remove_empty_does_not_remove_nonempty_folders(pids, store): # Get parent directory of the relative path parent_dir = os.path.dirname(object_metadata_shard_path) # Attempt to remove the parent directory - store.remove_empty(parent_dir) + # pylint: disable=W0212 + store._remove_empty(parent_dir) abs_parent_dir = store.objects + "/" + parent_dir assert 
os.path.exists(abs_parent_dir) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index e7ead830..ac806545 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -953,7 +953,8 @@ def test_delete_objects_cid_refs_file_with_pid_refs_remaining(pids, store): cid = object_metadata.id store.tag_object(pid, cid) cid_refs_abs_path = store.get_refs_abs_path("cid", cid) - store.update_cid_refs(cid_refs_abs_path, "dou.test.1") + # pylint: disable=W0212 + store._update_cid_refs(cid_refs_abs_path, "dou.test.1") _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) cid_refs_file_path = store.get_refs_abs_path("cid", cid) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 63fc54da..4b86055c 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -2,6 +2,8 @@ import os import pytest +# pylint: disable=W0212 + def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" @@ -9,7 +11,7 @@ def test_write_cid_refs_file(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) + store._write_cid_refs_file(cid_ref_abs_path, pid) assert os.path.exists(cid_ref_abs_path) @@ -19,7 +21,7 @@ def test_write_cid_refs_file_content(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) + store._write_cid_refs_file(cid_ref_abs_path, pid) with open(cid_ref_abs_path, "r", encoding="utf8") as f: cid_ref_file_pid = f.read() @@ -33,10 +35,10 @@ def test_update_cid_refs_content(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) 
store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) + store._write_cid_refs_file(cid_ref_abs_path, pid) pid_other = "dou.test.1" - store.update_cid_refs(cid_ref_abs_path, pid_other) + store._update_cid_refs(cid_ref_abs_path, pid_other) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): @@ -50,11 +52,11 @@ def test_update_cid_refs_content_multiple(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) + store._write_cid_refs_file(cid_ref_abs_path, pid) cid_reference_list = [pid] for i in range(0, 5): - store.update_cid_refs(cid_ref_abs_path, f"dou.test.{i}") + store._update_cid_refs(cid_ref_abs_path, f"dou.test.{i}") cid_reference_list.append(f"dou.test.{i}") line_count = 0 @@ -73,9 +75,9 @@ def test_update_cid_refs_content_pid_exists(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) + store._write_cid_refs_file(cid_ref_abs_path, pid) with pytest.raises(ValueError): - store.update_cid_refs(cid_ref_abs_path, pid) + store._update_cid_refs(cid_ref_abs_path, pid) def test_delete_cid_refs_pid(pids, store): @@ -84,11 +86,11 @@ def test_delete_cid_refs_pid(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) + store._write_cid_refs_file(cid_ref_abs_path, pid) pid_other = "dou.test.1" - store.update_cid_refs(cid_ref_abs_path, pid_other) - store.delete_cid_refs_pid(cid_ref_abs_path, pid) + store._update_cid_refs(cid_ref_abs_path, pid_other) + store._delete_cid_refs_pid(cid_ref_abs_path, pid) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for 
_, line in enumerate(f, start=1): @@ -103,12 +105,12 @@ def test_delete_cid_refs_pid_pid_not_found(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) + store._write_cid_refs_file(cid_ref_abs_path, pid) pid_other = "dou.test.1" - store.update_cid_refs(cid_ref_abs_path, pid_other) + store._update_cid_refs(cid_ref_abs_path, pid_other) with pytest.raises(ValueError): - store.delete_cid_refs_pid(cid_ref_abs_path, "dou.not.found.1") + store._delete_cid_refs_pid(cid_ref_abs_path, "dou.not.found.1") def test_delete_cid_refs_pid_file(pids, store): @@ -117,9 +119,9 @@ def test_delete_cid_refs_pid_file(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - store.delete_cid_refs_pid(cid_ref_abs_path, pid) - cid_refs_deleted = store.delete_cid_refs_file(cid_ref_abs_path) + store._write_cid_refs_file(cid_ref_abs_path, pid) + store._delete_cid_refs_pid(cid_ref_abs_path, pid) + cid_refs_deleted = store._delete_cid_refs_file(cid_ref_abs_path) assert cid_refs_deleted assert not os.path.exists(cid_ref_abs_path) @@ -132,8 +134,8 @@ def test_delete_cid_refs_pid_file_not_empty(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) store.create_path(os.path.dirname(cid_ref_abs_path)) - store.write_cid_refs_file(cid_ref_abs_path, pid) - cid_refs_deleted = store.delete_cid_refs_file(cid_ref_abs_path) + store._write_cid_refs_file(cid_ref_abs_path, pid) + cid_refs_deleted = store._delete_cid_refs_file(cid_ref_abs_path) assert not cid_refs_deleted @@ -143,7 +145,7 @@ def test_delete_cid_refs_pid_file_not_found(pids, store): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) with pytest.raises(FileNotFoundError): - 
store.delete_cid_refs_file(cid_ref_abs_path) + store._delete_cid_refs_file(cid_ref_abs_path) def test_write_pid_refs_file(pids, store): @@ -152,7 +154,7 @@ def test_write_pid_refs_file(pids, store): cid = pids[pid]["sha256"] pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) - store.write_pid_refs_file(pid_ref_abs_path, cid) + store._write_pid_refs_file(pid_ref_abs_path, cid) assert os.path.exists(pid_ref_abs_path) @@ -162,7 +164,7 @@ def test_write_pid_refs_file_content(pids, store): cid = pids[pid]["sha256"] pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) - store.write_pid_refs_file(pid_ref_abs_path, cid) + store._write_pid_refs_file(pid_ref_abs_path, cid) with open(pid_ref_abs_path, "r", encoding="utf8") as f: pid_refs_cid = f.read() @@ -176,8 +178,8 @@ def test_delete_pid_refs_file(pids, store): cid = pids[pid]["sha256"] pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) - store.write_pid_refs_file(pid_ref_abs_path, cid) - store.delete_pid_refs_file(pid_ref_abs_path) + store._write_pid_refs_file(pid_ref_abs_path, cid) + store._delete_pid_refs_file(pid_ref_abs_path) assert not os.path.exists(pid_ref_abs_path) @@ -187,4 +189,4 @@ def test_delete_pid_refs_file_file_not_found(pids, store): for pid in pids.keys(): pid_ref_abs_path = store.get_refs_abs_path("pid", pid) with pytest.raises(FileNotFoundError): - store.delete_cid_refs_file(pid_ref_abs_path) + store._delete_cid_refs_file(pid_ref_abs_path) From ea59f3e57120adbbb5ce3a87f0f717bf81a345ff Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 15:16:41 -0800 Subject: [PATCH 45/71] Add new method '_validate_references' that is now called after atomically moving refs files in 'tag_object' --- src/hashstore/filehashstore.py | 52 +++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git 
a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e8e265c9..0a5b0409 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -538,9 +538,9 @@ def tag_object(self, pid, cid): # Move both files shutil.move(pid_tmp_file_path, pid_ref_abs_path) shutil.move(cid_tmp_file_path, cid_ref_abs_path) + self._validate_references(pid, cid) return True finally: - # TODO: Verify that the reference files have been written as expected. # Release cid with self.reference_lock: logging.debug( @@ -1578,6 +1578,56 @@ def _validate_object( logging.error(exception_string) raise ValueError(exception_string) + def _validate_references(self, pid, cid): + """Verifies that the supplied pid and pid reference file and content have been + written successfully. + + Args: + pid (string): Authority-based or persistent identifier + cid (string): Content identifier + """ + # Check that reference files were created + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) + cid_ref_abs_path = self.get_refs_abs_path("cid", cid) + if not os.path.exists(pid_ref_abs_path): + exception_string = ( + "FileHashStore - _validate_references: Pid refs file missing: %s", + pid_ref_abs_path, + ) + logging.error(exception_string) + raise FileNotFoundError(exception_string) + if not os.path.exists(cid_ref_abs_path): + exception_string = ( + "FileHashStore - _validate_references: Cid refs file missing: %s", + cid_ref_abs_path, + ) + logging.error(exception_string) + raise FileNotFoundError(exception_string) + # Check the content of the reference files + # Start with the cid + retrieved_cid = self.find_object(pid) + if retrieved_cid != cid: + exception_string = ( + f"FileHashStore - _validate_references: Pid refs file exists ({pid_ref_abs_path})" + + f" but cid ({cid}) does not match." 
+ ) + logging.error(exception_string) + raise ValueError(exception_string) + # Then the pid + pid_found = False + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + if value == pid: + pid_found = True + if not pid_found: + exception_string = ( + f"FileHashStore - _validate_references: Cid refs file exists ({cid_ref_abs_path})" + + f" but pid ({pid}) not found." + ) + logging.error(exception_string) + raise ValueError(exception_string) + def _validate_metadata_to_store(self, metadata): """Evaluates a metadata argument to ensure that it is either a string, path or stream object before attempting to store it. From cef6e933f89add7ae20e3cacda5e767ddb76d333 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 13 Nov 2023 15:47:49 -0800 Subject: [PATCH 46/71] Refactor '_delete_cid_refs_file', revise pytests and add new pytests for '_validate_references' --- src/hashstore/filehashstore.py | 7 ++- tests/test_filehashstore_interface.py | 7 ++- tests/test_filehashstore_references.py | 76 ++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 11 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 0a5b0409..9ebf046e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1264,14 +1264,15 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): "FileHashStore - delete_cid_refs_file: Cid reference file not found: %s", cid_ref_abs_path, ) + logging.error(err_msg) raise FileNotFoundError(err_msg) if os.path.getsize(cid_ref_abs_path) != 0: - warn_msg = ( + err_msg = ( "FileHashStore - delete_cid_refs_file: Failed to delete cid reference file." 
+ f" File is not empty: {cid_ref_abs_path} " ) - logging.warning(warn_msg) - return False + logging.error(err_msg) + raise OSError(err_msg) else: os.remove(cid_ref_abs_path) return True diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index ac806545..afc63d53 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -956,9 +956,10 @@ def test_delete_objects_cid_refs_file_with_pid_refs_remaining(pids, store): # pylint: disable=W0212 store._update_cid_refs(cid_refs_abs_path, "dou.test.1") _metadata_cid = store.store_metadata(pid, syspath, format_id) - store.delete_object(pid) - cid_refs_file_path = store.get_refs_abs_path("cid", cid) - assert os.path.exists(cid_refs_file_path) + with pytest.raises(OSError): + store.delete_object(pid) + cid_refs_file_path = store.get_refs_abs_path("cid", cid) + assert os.path.exists(cid_refs_file_path) def test_delete_object_pid_empty(store): diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 4b86055c..8301c030 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -70,7 +70,7 @@ def test_update_cid_refs_content_multiple(pids, store): def test_update_cid_refs_content_pid_exists(pids, store): - """Test that update_cid_ref does not write pid if pid already exists""" + """Test that update_cid_ref throws exception if pid already exists.""" for pid in pids.keys(): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) @@ -128,15 +128,14 @@ def test_delete_cid_refs_pid_file(pids, store): def test_delete_cid_refs_pid_file_not_empty(pids, store): - """Test that delete_cid_refs_file does not raise an exception when refs file - is not empty.""" + """Test that delete_cid_refs_file raises an exception when refs file is not empty.""" for pid in pids.keys(): cid = pids[pid]["sha256"] cid_ref_abs_path = store.get_refs_abs_path("cid", cid) 
store.create_path(os.path.dirname(cid_ref_abs_path)) store._write_cid_refs_file(cid_ref_abs_path, pid) - cid_refs_deleted = store._delete_cid_refs_file(cid_ref_abs_path) - assert not cid_refs_deleted + with pytest.raises(OSError): + store._delete_cid_refs_file(cid_ref_abs_path) def test_delete_cid_refs_pid_file_not_found(pids, store): @@ -190,3 +189,70 @@ def test_delete_pid_refs_file_file_not_found(pids, store): pid_ref_abs_path = store.get_refs_abs_path("pid", pid) with pytest.raises(FileNotFoundError): store._delete_cid_refs_file(pid_ref_abs_path) + + +def test_validate_references_pid_refs_file_missing(pids, store): + """Test that validate_references throws exception when pid refs file is missing.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + with pytest.raises(FileNotFoundError): + store._validate_references(pid, cid) + + +def test_validate_references_pid_refs_incorrect_cid(pids, store): + """Test that validate_references throws exception when pid refs file cid is incorrect.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store._write_pid_refs_file(pid_ref_abs_path, "bad_cid") + with pytest.raises(FileNotFoundError): + store._validate_references(pid, cid) + + +def test_validate_references_cid_refs_file_missing(pids, store): + """Test that validate_references throws exception when cid refs file is missing.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store._write_pid_refs_file(pid_ref_abs_path, cid) + with pytest.raises(FileNotFoundError): + store._validate_references(pid, cid) + + +def test_validate_references_cid_refs_file_missing_pid(pids, store): + """Test that validate_references throws exception when cid refs file does not contain + the expected pid.""" + for pid in pids.keys(): + cid = 
pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store._write_pid_refs_file(pid_ref_abs_path, cid) + store._write_cid_refs_file(cid_ref_abs_path, "bad_pid") + with pytest.raises(ValueError): + store._validate_references(pid, cid) + + +def test_validate_references_cid_refs_file_with_multiple_refs_missing_pid(pids, store): + """Test that validate_references throws exception when cid refs file with multiple + references does not contain the expected pid.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store._write_pid_refs_file(pid_ref_abs_path, cid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store._write_cid_refs_file(cid_ref_abs_path, "bad_pid") + + cid_reference_list = [pid] + for i in range(0, 5): + store._update_cid_refs(cid_ref_abs_path, f"dou.test.{i}") + cid_reference_list.append(f"dou.test.{i}") + + with pytest.raises(ValueError): + store._validate_references(pid, cid) From b567d5b0218b658985f07de329eb28c594bc267f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 Nov 2023 09:39:31 -0800 Subject: [PATCH 47/71] Add pytests for 'store_data_only' --- src/hashstore/filehashstore.py | 8 ------ tests/test_filehashstore.py | 48 +++++++++++++++++++++++++++++----- 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 9ebf046e..e4b15d1d 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -501,7 +501,6 @@ def tag_object(self, pid, cid): self.reference_locked_cids.append(cid) try: # TODO: Review process and test what 
happens when specific pieces fail - # We cannot have a pid ref file whose pid is not referenced in the cid refs file pid_ref_abs_path = self.get_refs_abs_path("pid", pid) cid_ref_abs_path = self.get_refs_abs_path("cid", cid) if os.path.exists(pid_ref_abs_path): @@ -839,13 +838,6 @@ def store_data_only(self, data): """ logging.debug("FileHashStore - store_object: Request to store object.") - # TODO: Missing Tests - # - Test that this method returns hex digests and that they are correct - # - Test that objects are actually stored with their cid - # - Test that exception is raised when object fails to store - # - Test that exception is raised when object already exists - # - Test providing the data as a file path - # - Test providing the data as a stream try: # Ensure the data is a stream stream = Stream(data) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 801257b4..8c448ad4 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -206,7 +206,7 @@ def test_set_default_algorithms_missing_yaml(store, pids): def test_store_and_validate_data_files_path(pids, store): - """Test put objects with path object.""" + """Test store_and_validate_data objects with path object.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -217,7 +217,7 @@ def test_store_and_validate_data_files_path(pids, store): def test_store_and_validate_data_files_string(pids, store): - """Test put objects with string.""" + """Test store_and_validate_data objects with string.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -228,7 +228,7 @@ def test_store_and_validate_data_files_string(pids, store): def test_store_and_validate_data_files_stream(pids, store): - """Test put objects with stream.""" + """Test store_and_validate_data objects with stream.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -242,7 +242,7 @@ def test_store_and_validate_data_files_stream(pids, store): 
def test_store_and_validate_data_cid(pids, store): - """Check put returns correct id.""" + """Check store_and_validate_data returns correct id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -252,7 +252,7 @@ def test_store_and_validate_data_cid(pids, store): def test_store_and_validate_data_file_size(pids, store): - """Check put returns correct file size.""" + """Check store_and_validate_data returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -262,7 +262,7 @@ def test_store_and_validate_data_file_size(pids, store): def test_store_and_validate_data_hex_digests(pids, store): - """Check put successfully generates hex digests dictionary.""" + """Check store_and_validate_data successfully generates hex digests dictionary.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -303,7 +303,7 @@ def test_store_and_validate_data_with_correct_checksums(pids, store): def test_store_and_validate_data_with_incorrect_checksum(pids, store): - """Check put fails when bad checksum supplied.""" + """Check store_and_validate_data fails when bad checksum supplied.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -317,6 +317,40 @@ def test_store_and_validate_data_with_incorrect_checksum(pids, store): assert store.count(entity) == 0 +def test_store_data_only_cid(pids, store): + """Check store_data_only returns correct id.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_data_only(path) + object_metadata_id = object_metadata.id + assert object_metadata_id == pids[pid][store.algorithm] + + +def test_store_data_only_file_size(pids, store): + """Check store_data_only returns correct file size.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata 
= store.store_data_only(path) + object_size = object_metadata.obj_size + assert object_size == pids[pid]["file_size_bytes"] + + +def test_store_data_only_hex_digests(pids, store): + """Check store_data_only generates hex digests dictionary.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_data_only(path) + object_metadata_hex_digests = object_metadata.hex_digests + assert object_metadata_hex_digests.get("md5") == pids[pid]["md5"] + assert object_metadata_hex_digests.get("sha1") == pids[pid]["sha1"] + assert object_metadata_hex_digests.get("sha256") == pids[pid]["sha256"] + assert object_metadata_hex_digests.get("sha384") == pids[pid]["sha384"] + assert object_metadata_hex_digests.get("sha512") == pids[pid]["sha512"] + + def test_move_and_get_checksums_id(pids, store): """Test move returns correct id.""" test_dir = "tests/testdata/" From 348c536c9c0a5f43b9c17bbf8ac837fcc1878461 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 Nov 2023 13:41:27 -0800 Subject: [PATCH 48/71] Clean up comments, code and logging statements --- src/hashstore/filehashstore.py | 55 ++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e4b15d1d..a6efed56 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -823,7 +823,8 @@ def store_and_validate_data( def store_data_only(self, data): """Store an object to HashStore and return the id and a hex digest - dictionary of the default algorithms. + dictionary of the default algorithms. This method does not validate the + object and writes directly to /objects after the hex digests are calculated. Args: data (mixed): String or path to object. @@ -836,7 +837,9 @@ def store_data_only(self, data): object_metadata (ObjectMetadata): object that contains the object id, object file size and hex digest dictionary. 
""" - logging.debug("FileHashStore - store_object: Request to store object.") + logging.debug( + "FileHashStore - store_object: Request to store data object only." + ) try: # Ensure the data is a stream @@ -902,10 +905,6 @@ def _move_and_get_checksums( Returns: object_metadata (tuple): object id, object file size and hex digest dictionary. """ - # TODO: If the checksum algorithm is the same as the store algorithm, then we can - # determine whether the object exists or not to be efficient - - # Create temporary file and calculate hex digests debug_msg = ( "FileHashStore - _move_and_get_checksums: Creating temp" + f" file and calculating checksums for pid: {pid}" @@ -993,8 +992,8 @@ def _move_and_get_checksums( else: # Else delete temporary file exception_string = ( - f"FileHashStore - _move_and_get_checksums: Object exists at: {abs_file_path}," - + " deleting temporary file." + "FileHashStore - _move_and_get_checksums: Object already exists at:" + + f" {abs_file_path}, deleting temporary file." 
) logging.error(exception_string) self.delete(entity, tmp_file_name) @@ -1133,8 +1132,11 @@ def _write_cid_refs_file(self, path, pid): path (string): Path of file to be written into pid (string): Authority-based or persistent identifier of object """ - info_msg = f"FileHashStore - write_cid_refs_file: Writing pid ({pid}) into file: {path}" - logging.info(info_msg) + logging.debug( + "FileHashStore - write_cid_refs_file: Writing pid (%s) into file: %s", + pid, + path, + ) try: with open(path, "w", encoding="utf8") as cid_ref_file: @@ -1160,11 +1162,11 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): cid_ref_abs_path (string): Absolute path to the cid ref file pid (string): Authority-based or persistent identifier of object """ - info_msg = ( - f"FileHashStore - update_cid_refs: Adding pid ({pid}) into cid reference" - + f" file: {cid_ref_abs_path}" + logging.debug( + "FileHashStore - update_cid_refs: Adding pid (%s) into cid reference file: %s", + pid, + cid_ref_abs_path, ) - logging.info(info_msg) try: with open(cid_ref_abs_path, "r", encoding="utf8") as f: @@ -1200,11 +1202,11 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): cid_ref_abs_path (string): Absolute path to the cid ref file pid (string): Authority-based or persistent identifier of object """ - info_msg = ( - f"FileHashStore - delete_cid_refs_pid: Deleting pid ({pid}) from cid reference" - + f" file: {cid_ref_abs_path}" + logging.debug( + "FileHashStore - delete_cid_refs_pid: Deleting pid (%s) from cid reference file: %s", + pid, + cid_ref_abs_path, ) - logging.info(info_msg) try: with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: @@ -1244,11 +1246,10 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): Returns: boolean: True if deleted, False if not """ - info_msg = ( + logging.debug( "FileHashStore - delete_cid_refs_file: Deleting reference file: %s", cid_ref_abs_path, ) - logging.info(info_msg) try: if not os.path.exists(cid_ref_abs_path): @@ -1287,8 +1288,11 @@ def 
_write_pid_refs_file(self, path, cid): path (string): Path of file to be written into cid (string): Content identifier """ - info_msg = f"FileHashStore - write_pid_refs_file: Writing cid ({cid}) into file: {path}" - logging.info(info_msg) + logging.debug( + "FileHashStore - write_pid_refs_file: Writing cid (%s) into file: %s", + cid, + path, + ) try: with open(path, "w", encoding="utf8") as pid_ref_file: @@ -1313,11 +1317,10 @@ def _delete_pid_refs_file(self, pid_ref_abs_path): Args: pid_ref_abs_path (string): Absolute path to the pid ref file """ - info_msg = ( + logging.debug( "FileHashStore - delete_pid_refs_file: Deleting reference file: %s", pid_ref_abs_path, ) - logging.info(info_msg) try: if not os.path.exists(pid_ref_abs_path): @@ -1551,7 +1554,7 @@ def _validate_object( if file_size_to_validate != tmp_file_size: self.delete(entity, tmp_file_name) exception_string = ( - "FileHashStore - _move_and_get_checksums: Object file size calculated: " + "FileHashStore - _validate_object: Object file size calculated: " + f" {tmp_file_size} does not match with expected size:" + f"{file_size_to_validate}. Tmp file deleted and file not stored for" + f" pid: {pid}" @@ -1563,7 +1566,7 @@ def _validate_object( if hex_digest_stored != checksum: self.delete(entity, tmp_file_name) exception_string = ( - "FileHashStore - _move_and_get_checksums: Hex digest and checksum" + "FileHashStore - _validate_object: Hex digest and checksum" + f" do not match - file not stored for pid: {pid}. Algorithm:" + f" {checksum_algorithm}. Checksum provided: {checksum} !=" + f" HexDigest: {hex_digest_stored}. Tmp file deleted." 
From 3766155ef3f80541b041d9489ca48835654d2cb0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 Nov 2023 14:00:33 -0800 Subject: [PATCH 49/71] Refactor '_validate_object' method and update docstrings --- src/hashstore/filehashstore.py | 51 +++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a6efed56..3b54c48d 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1537,42 +1537,55 @@ def _validate_object( tmp_file_size, file_size_to_validate, ): - """Evaluates an object's integrity + """Evaluates an object's integrity and throws exception if there is a mismatch. Args: - pid: For logging purposes - checksum: Value of checksum - checksum_algorithm: Algorithm of checksum - entity: Type of object - hex_digests: Dictionary of hex digests to select from - tmp_file_name: Name of tmp file - tmp_file_size: Size of the tmp file - file_size_to_validate: Expected size of the object + pid (string): For logging purposes + checksum (string): Value of checksum to check + checksum_algorithm (string): Algorithm of checksum + entity (string): Type of object ('objects' or 'metadata') + hex_digests (dictionary): Dictionary of hex digests to parse + tmp_file_name (string): Name of tmp file + tmp_file_size (int): Size of the tmp file + file_size_to_validate (int): Expected size of the object """ - # TODO: Refactor this method and/or create a new method for Metacat client to call if file_size_to_validate is not None and file_size_to_validate > 0: if file_size_to_validate != tmp_file_size: - self.delete(entity, tmp_file_name) exception_string = ( "FileHashStore - _validate_object: Object file size calculated: " + f" {tmp_file_size} does not match with expected size:" - + f"{file_size_to_validate}. Tmp file deleted and file not stored for" - + f" pid: {pid}" + + f"{file_size_to_validate}." 
) - logging.error(exception_string) - raise ValueError(exception_string) + if pid is not None: + self.delete(entity, tmp_file_name) + exception_string_for_pid = ( + exception_string + + f" Tmp file deleted and file not stored for pid: {pid}" + ) + logging.error(exception_string_for_pid) + raise ValueError(exception_string_for_pid) + else: + logging.error(exception_string) + raise ValueError(exception_string) if checksum_algorithm is not None and checksum is not None: hex_digest_stored = hex_digests[checksum_algorithm] if hex_digest_stored != checksum: - self.delete(entity, tmp_file_name) exception_string = ( "FileHashStore - _validate_object: Hex digest and checksum" + f" do not match - file not stored for pid: {pid}. Algorithm:" + f" {checksum_algorithm}. Checksum provided: {checksum} !=" - + f" HexDigest: {hex_digest_stored}. Tmp file deleted." + + f" HexDigest: {hex_digest_stored}." ) - logging.error(exception_string) - raise ValueError(exception_string) + if pid is not None: + self.delete(entity, tmp_file_name) + exception_string_for_pid = ( + exception_string + f"Tmp file ({tmp_file_name}) deleted." 
+ ) + logging.error(exception_string_for_pid) + raise ValueError(exception_string_for_pid) + else: + logging.error(exception_string) + raise ValueError(exception_string) def _validate_references(self, pid, cid): """Verifies that the supplied pid and pid reference file and content have been From 3b5275b807b2b4a670dda1ffd9401f0307195cc1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 Nov 2023 14:15:21 -0800 Subject: [PATCH 50/71] Add new method 'verify_object' to allow caller to validate an object's integrity --- src/hashstore/filehashstore.py | 42 +++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3b54c48d..93339569 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -477,6 +477,46 @@ def store_object( return object_metadata + def verify_object( + self, object_metadata, checksum, checksum_algorithm, expected_file_size + ): + """Confirms that a object_metadata's content is equal to the given values. + + Args: + object_metadata (ObjectMetadata): object_metadata object + checksum (string): Value of checksum + checksum_algorithm (string): Algorithm of checksum + expected_file_size (int): Size of the tmp file + """ + logging.debug( + "FileHashStore - verify_object: Called to verify object with id: %s", + object_metadata.id, + ) + self._is_string_none_or_empty(checksum, "checksum", "verify_object") + self._is_string_none_or_empty( + checksum_algorithm, "checksum_algorithm", "verify_object" + ) + self._validate_file_size(expected_file_size) + if object_metadata is None or not isinstance(ObjectMetadata): + raise ValueError( + "FileHashStore - verify_object: 'object_metadata' cannot be None." + + " Must be a 'ObjectMetadata' object." 
+ ) + else: + object_metadata_hex_digests = object_metadata.hex_digests + object_metadata_file_size = object_metadata.obj_size + checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) + self._validate_object( + pid=None, + checksum=checksum, + checksum_algorithm=checksum_algorithm_checked, + entity=None, + hex_digests=object_metadata_hex_digests, + tmp_file_name=None, + tmp_file_size=object_metadata_file_size, + file_size_to_validate=expected_file_size, + ) + def tag_object(self, pid, cid): logging.debug( "FileHashStore - tag_object: Tagging object cid: {%s} with pid: {%s}.", @@ -1994,7 +2034,7 @@ def count(self, entity): @staticmethod def _validate_file_size(file_size): - """Checks whether a file size is > 0 and an int and throws exception if not. + """Checks whether a given argument is an integer and > 0 and throws exception if not. Args: file_size (int): file size to check From f8d142543a751f23c7b1455bd7b97439dbd4876a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 Nov 2023 14:40:41 -0800 Subject: [PATCH 51/71] Clean up code and add TODO items --- src/hashstore/filehashstore.py | 45 +++++++++++++++++--------- tests/test_filehashstore_references.py | 4 +-- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 93339569..a4c37dbd 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -18,8 +18,7 @@ class FileHashStore(HashStore): """FileHashStore is a content addressable file manager based on Derrick Gilland's 'hashfs' library. It supports the storage of objects on disk using - an authority-based identifier's hex digest with a given hash algorithm value - to address files. + a content identifier to address files. FileHashStore initializes using a given properties dictionary containing the required keys (see Args). 
Upon initialization, FileHashStore verifies the provided @@ -113,6 +112,7 @@ def __init__(self, properties=None): if not os.path.exists(self.metadata): self.create_path(self.metadata + "/tmp") if not os.path.exists(self.refs): + self.create_path(self.refs + "/tmp") self.create_path(self.refs + "/pid") self.create_path(self.refs + "/cid") logging.debug( @@ -488,6 +488,7 @@ def verify_object( checksum_algorithm (string): Algorithm of checksum expected_file_size (int): Size of the tmp file """ + # TODO: Write tests logging.debug( "FileHashStore - verify_object: Called to verify object with id: %s", object_metadata.id, @@ -540,10 +541,10 @@ def tag_object(self, pid, cid): ) self.reference_locked_cids.append(cid) try: - # TODO: Review process and test what happens when specific pieces fail pid_ref_abs_path = self.get_refs_abs_path("pid", pid) cid_ref_abs_path = self.get_refs_abs_path("cid", cid) if os.path.exists(pid_ref_abs_path): + # A pid reference file can only contain one cid exception_string = ( "FileHashStore - write_pid_refs_file: pid ref file already exists for %s", pid_ref_abs_path, @@ -551,13 +552,12 @@ def tag_object(self, pid, cid): logging.error(exception_string) raise FileExistsError(exception_string) elif os.path.exists(cid_ref_abs_path): - # If it does, read the file and add the new pid on its own line + # Update cid ref files if it already exists self.update_cid_refs(cid_ref_abs_path, pid) else: # All ref files begin as tmp files and get moved sequentially at once # Ensure refs tmp folder exists tmp_root_path = self.get_store_path("refs") / "tmp" - # Physically create directory if it doesn't exist if os.path.exists(tmp_root_path) is False: self.create_path(tmp_root_path) @@ -565,7 +565,7 @@ def tag_object(self, pid, cid): pid_tmp_file = self._mktmpfile(tmp_root_path) pid_tmp_file_path = pid_tmp_file.name self._write_pid_refs_file(pid_tmp_file_path, cid) - # Then write cid_refs_file content into tmp file + # Then write cid_refs_file content into another 
tmp file cid_tmp_file = self._mktmpfile(tmp_root_path) cid_tmp_file_path = cid_tmp_file.name self._write_cid_refs_file(cid_tmp_file_path, pid) @@ -577,6 +577,8 @@ def tag_object(self, pid, cid): # Move both files shutil.move(pid_tmp_file_path, pid_ref_abs_path) shutil.move(cid_tmp_file_path, cid_ref_abs_path) + # Ensure that the reference files have been written as expected + # If there is an issue, client or user will have to manually review self._validate_references(pid, cid) return True finally: @@ -967,9 +969,10 @@ def _move_and_get_checksums( object_cid = hex_digests.get(self.algorithm) abs_file_path = self.build_abs_path(entity, object_cid, extension) - # Only move file if it doesn't exist. - # Files are stored once and only once + # Only move file if it doesn't exist. We do not check before we create the tmp + # file and calculate the hex digests because the given checksum could be incorrect. if not os.path.isfile(abs_file_path): + # Files are stored once and only once self._validate_object( pid, checksum, @@ -1164,9 +1167,8 @@ def delete_tmp_file(): return tmp def _write_cid_refs_file(self, path, pid): - """Write the reference file in the supplied path for the given content - identifier (cid). A reference file contains every pid that references a - cid each on its own line. + """Write the cid reference file in the supplied path. A reference file contains + every pid that references a cid each on its own line. Args: path (string): Path of file to be written into @@ -1178,6 +1180,11 @@ def _write_cid_refs_file(self, path, pid): path, ) + # TODO: Check that the given path does not contain any data before writing + # This method only writes a new cid refs file and should not overwrite + # an existing one. 
+ # TODO: Write test to confirm exception is thrown when path contains data + try: with open(path, "w", encoding="utf8") as cid_ref_file: fcntl.flock(cid_ref_file, fcntl.LOCK_EX) @@ -1208,6 +1215,10 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): cid_ref_abs_path, ) + # TODO: Throw exception if the file doesn't exist. This method should only + # proceed when there is an existing cid refs file. + # TODO: Write test to check for exception thrown + try: with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): @@ -1319,10 +1330,8 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): raise err def _write_pid_refs_file(self, path, cid): - """Write the reference file in the supplied path for the given pid (persistent + """Write the pid reference file in the supplied path for the given cid (content identifier). A reference file for a pid contains the cid that it references. - Its permanent address is the pid hash using HashStore's default store algorithm - and follows its directory structure. Args: path (string): Path of file to be written into @@ -1988,7 +1997,11 @@ def build_abs_path(self, entity, hash_id, extension=""): return absolute_path def get_refs_abs_path(self, ref_type, hash_id): - """Get the absolute path to the reference file for the given pid. + """Get the absolute path to the reference file for the given ref_type. If a + 'pid' is provided, this method will calculate the pid's hash based on the store + algorithm, and return the expected address of the pid reference file. If a + 'cid' is provided, this method will return the expected address by sharding the + cid based on HashStore's configuration. 
Args: ref_type (string): 'pid' or 'cid' @@ -2060,7 +2073,7 @@ def _is_string_none_or_empty(string, arg, method): Args: string (string): Value to check - arg (): Name of argument to check + arg (string): Name of argument to check method (string): Calling method for logging purposes """ if string is None or string.replace(" ", "") == "": diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 8301c030..9051d9e3 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -127,7 +127,7 @@ def test_delete_cid_refs_pid_file(pids, store): assert not os.path.exists(cid_ref_abs_path) -def test_delete_cid_refs_pid_file_not_empty(pids, store): +def test_delete_cid_refs_file_file_not_empty(pids, store): """Test that delete_cid_refs_file raises an exception when refs file is not empty.""" for pid in pids.keys(): cid = pids[pid]["sha256"] @@ -138,7 +138,7 @@ def test_delete_cid_refs_pid_file_not_empty(pids, store): store._delete_cid_refs_file(cid_ref_abs_path) -def test_delete_cid_refs_pid_file_not_found(pids, store): +def test_delete_cid_refs_file_file_not_found(pids, store): """Test that delete_cid_refs_file raises an exception when refs file not found.""" for pid in pids.keys(): cid = pids[pid]["sha256"] From 68ba2a781bc0d8ccaaf48a939869e1ac8324609f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 16 Nov 2023 10:53:18 -0800 Subject: [PATCH 52/71] Clean-up test modules' comments and docstrings and move 'tag_object' test methods to 'test_filehashstore_references' --- src/hashstore/hashstore.py | 2 +- tests/test_filehashstore.py | 2 +- tests/test_filehashstore_interface.py | 93 +------------------------- tests/test_filehashstore_references.py | 93 +++++++++++++++++++++++++- tests/test_filehashstore_stream.py | 2 +- tests/test_hashstore.py | 8 +-- tests/test_hashstore_client.py | 2 +- 7 files changed, 101 insertions(+), 101 deletions(-) diff --git a/src/hashstore/hashstore.py 
b/src/hashstore/hashstore.py index 130c1304..b1851d0e 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -33,7 +33,7 @@ def store_object( The file's id is determined by calculating the object's content identifier based on the store's default algorithm, which is also used as the permanent address of the file. - The file's identifier is then sharded using a depth of 3 and width of 2, + The file's identifier is then sharded using the store's configured depth and width, delimited by '/' and concatenated to produce the final permanent address and is stored in the `/store_directory/objects/` directory. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 8c448ad4..59c8b1ac 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1,4 +1,4 @@ -"""Test module for FileHashStore core, utility and supporting methods""" +"""Test module for FileHashStore core, utility and supporting methods.""" import io import os from pathlib import Path diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index afc63d53..566849da 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -1,4 +1,4 @@ -"""Test module for FileHashStore HashStore interface methods""" +"""Test module for FileHashStore HashStore interface methods.""" import io import os from pathlib import Path @@ -521,97 +521,6 @@ def test_store_object_sparse_large_file(store): assert object_metadata_id == object_metadata.hex_digests.get("sha256") -def test_tag_object(pids, store): - """Test tag object returns boolean.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - object_tagged = store.tag_object(pid, object_metadata.id) - assert object_tagged - - -def test_tag_object_pid_refs_file(pids, store): - """Test tag object creates the pid reference file.""" - test_dir = 
"tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) - pid_refs_file_path = store.get_refs_abs_path("pid", pid) - assert os.path.exists(pid_refs_file_path) - - -def test_tag_object_pid_refs_file_exists(pids, store): - """Test tag object throws exception when pid refs file already exists.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - cid = object_metadata.id - store.tag_object(pid, cid) - pid_refs_file_path = store.get_refs_abs_path("pid", pid) - assert os.path.exists(pid_refs_file_path) - cid_refs_file_path = store.get_refs_abs_path("cid", cid) - assert os.path.exists(cid_refs_file_path) - with pytest.raises(FileExistsError): - store.tag_object(pid, cid) - - -def test_tag_object_pid_refs_file_content(pids, store): - """Test tag object creates the pid reference file contains the correct cid.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) - pid_refs_file_path = store.get_refs_abs_path("pid", pid) - with open(pid_refs_file_path, "r", encoding="utf8") as f: - pid_refs_cid = f.read() - assert pid_refs_cid == object_metadata.id - - -def test_tag_object_cid_refs_file(pids, store): - """Test tag object creates the cid reference file.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - cid = object_metadata.id - store.tag_object(pid, object_metadata.id) - cid_refs_file_path = store.get_refs_abs_path("cid", cid) - assert os.path.exists(cid_refs_file_path) - - -def test_tag_object_cid_refs_file_content(pids, store): - """Test tag object tags cid reference file successfully with pid.""" - test_dir = 
"tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) - cid_refs_file_path = store.get_refs_abs_path("cid", object_metadata.id) - with open(cid_refs_file_path, "r", encoding="utf8") as f: - pid_refs_cid = f.read().strip() - assert pid_refs_cid == pid - - -def test_tag_object_cid_refs_file_exists(pids, store): - """Test tag object raises exception when trying to add another cid to an - existing pid reference file and that a cid reference file is not created.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) - another_cid = "dou.test.1" - with pytest.raises(FileExistsError): - store.tag_object(pid, another_cid) - - second_cid_hash = store.get_refs_abs_path("cid", another_cid) - assert not os.path.exists(second_cid_hash) - - def test_find_object(pids, store): """Test find object returns the correct content identifier (cid).""" test_dir = "tests/testdata/" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 9051d9e3..9a4fc061 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -1,10 +1,101 @@ -"""Test module for FileHashStore core, utility and supporting methods""" +"""Test module for FileHashStore's reference system to tag stored objects.""" import os import pytest # pylint: disable=W0212 +def test_tag_object(pids, store): + """Test tag object returns boolean.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + object_tagged = store.tag_object(pid, object_metadata.id) + assert object_tagged + + +def test_tag_object_pid_refs_file(pids, store): + """Test tag object creates the pid reference file.""" + test_dir 
= "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + pid_refs_file_path = store.get_refs_abs_path("pid", pid) + assert os.path.exists(pid_refs_file_path) + + +def test_tag_object_pid_refs_file_exists(pids, store): + """Test tag object throws exception when pid refs file already exists.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.id + store.tag_object(pid, cid) + pid_refs_file_path = store.get_refs_abs_path("pid", pid) + assert os.path.exists(pid_refs_file_path) + cid_refs_file_path = store.get_refs_abs_path("cid", cid) + assert os.path.exists(cid_refs_file_path) + with pytest.raises(FileExistsError): + store.tag_object(pid, cid) + + +def test_tag_object_pid_refs_file_content(pids, store): + """Test tag object creates the pid reference file contains the correct cid.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + pid_refs_file_path = store.get_refs_abs_path("pid", pid) + with open(pid_refs_file_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() + assert pid_refs_cid == object_metadata.id + + +def test_tag_object_cid_refs_file(pids, store): + """Test tag object creates the cid reference file.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.id + store.tag_object(pid, object_metadata.id) + cid_refs_file_path = store.get_refs_abs_path("cid", cid) + assert os.path.exists(cid_refs_file_path) + + +def test_tag_object_cid_refs_file_content(pids, store): + """Test tag object tags cid reference file successfully with pid.""" + test_dir 
= "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + cid_refs_file_path = store.get_refs_abs_path("cid", object_metadata.id) + with open(cid_refs_file_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read().strip() + assert pid_refs_cid == pid + + +def test_tag_object_cid_refs_file_exists(pids, store): + """Test tag object raises exception when trying to add another cid to an + existing pid reference file and that a cid reference file is not created.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + another_cid = "dou.test.1" + with pytest.raises(FileExistsError): + store.tag_object(pid, another_cid) + + second_cid_hash = store.get_refs_abs_path("cid", another_cid) + assert not os.path.exists(second_cid_hash) + + def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): diff --git a/tests/test_filehashstore_stream.py b/tests/test_filehashstore_stream.py index 8cf4a7d0..94e6c412 100644 --- a/tests/test_filehashstore_stream.py +++ b/tests/test_filehashstore_stream.py @@ -1,4 +1,4 @@ -"""Test module for Stream""" +"""Test module for FileHashStore's Stream class.""" import hashlib import io from pathlib import Path diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index 68cd195a..953e0fac 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -1,4 +1,4 @@ -"""Test module for HashStore Module""" +"""Test module for HashStore's HashStoreFactory and ObjectMetadata class.""" import os import pytest from hashstore.hashstore import ObjectMetadata, HashStoreFactory @@ -43,7 +43,7 @@ def test_factory_get_hashstore_unsupported_module(factory): def 
test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): - """Check factory raises exception with store algorithm value that part of the default list""" + """Check factory raises exception with store algorithm value that part of the default list.""" module_name = "hashstore.filehashstore" class_name = "FileHashStore" @@ -59,7 +59,7 @@ def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory): - """Check factory raises exception with incorrectly formatted algorithm value""" + """Check factory raises exception with incorrectly formatted algorithm value.""" module_name = "hashstore.filehashstore" class_name = "FileHashStore" @@ -75,7 +75,7 @@ def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory) def test_objectmetadata(): - """Test class returns correct values via dot notation.""" + """Test ObjectMetadata class returns correct values via dot notation.""" ab_id = "hashstoretest" obj_size = 1234 hex_digest_dict = { diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 7d1e01a0..d7ec6324 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -1,4 +1,4 @@ -"""Test module for the Python client (Public API calls only)""" +"""Test module for the Python client (Public API calls only).""" import sys import os from pathlib import Path From 511e3e65b95600327a9dc0d00cb7bcfcd7630268 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 16 Nov 2023 12:36:02 -0800 Subject: [PATCH 53/71] Revise '_update_cid_refs' and add new pytest to throw exception if file is not found --- src/hashstore/filehashstore.py | 11 +++++++---- tests/test_filehashstore_references.py | 9 +++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a4c37dbd..2432d435 100644 --- a/src/hashstore/filehashstore.py +++ 
b/src/hashstore/filehashstore.py @@ -1214,10 +1214,13 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): pid, cid_ref_abs_path, ) - - # TODO: Throw exception if the file doesn't exist. This method should only - # proceed when there is an existing cid refs file. - # TODO: Write test to check for exception thrown + if not os.path.exists(cid_ref_abs_path): + exception_string = ( + f"FileHashStore - update_cid_refs: {cid_ref_abs_path} does not exist." + + f" Cannot write pid: {[pid]}" + ) + logging.error(exception_string) + raise FileNotFoundError(exception_string) try: with open(cid_ref_abs_path, "r", encoding="utf8") as f: diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 9a4fc061..6a1afc07 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -171,6 +171,15 @@ def test_update_cid_refs_content_pid_exists(pids, store): store._update_cid_refs(cid_ref_abs_path, pid) +def test_update_cid_refs_content_cid_refs_does_not_exist(pids, store): + """Test that update_cid_ref throws exception if cid refs file doesn't exist.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + with pytest.raises(FileNotFoundError): + store._update_cid_refs(cid_ref_abs_path, pid) + + def test_delete_cid_refs_pid(pids, store): """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" for pid in pids.keys(): From 2c4a1bdc44549320d806e6950890d11be6ba291c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 16 Nov 2023 12:54:11 -0800 Subject: [PATCH 54/71] Rename '_validate_file_size' to '_is_int_and_non_negative' for accuracy --- src/hashstore/filehashstore.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 2432d435..a3abc1ac 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -424,7 +424,7 @@ def 
store_object( # Validate input parameters self._is_string_none_or_empty(pid, "pid", "store_object") self._validate_data_to_store(data) - self._validate_file_size(expected_object_size) + self._is_int_and_non_negative(expected_object_size) ( additional_algorithm_checked, checksum_algorithm_checked, @@ -497,7 +497,7 @@ def verify_object( self._is_string_none_or_empty( checksum_algorithm, "checksum_algorithm", "verify_object" ) - self._validate_file_size(expected_file_size) + self._is_int_and_non_negative(expected_file_size) if object_metadata is None or not isinstance(ObjectMetadata): raise ValueError( "FileHashStore - verify_object: 'object_metadata' cannot be None." @@ -2049,7 +2049,7 @@ def count(self, entity): # Other Static Methods @staticmethod - def _validate_file_size(file_size): + def _is_int_and_non_negative(file_size): """Checks whether a given argument is an integer and > 0 and throws exception if not. Args: @@ -2058,14 +2058,14 @@ def _validate_file_size(file_size): if file_size is not None: if not isinstance(file_size, int): exception_string = ( - "FileHashStore - _is_file_size_valid: size given must be an integer." + "FileHashStore - _is_int_and_non_negative: size given must be an integer." + f" File size: {file_size}. Arg Type: {type(file_size)}." 
) logging.error(exception_string) raise TypeError(exception_string) if file_size < 1 or not isinstance(file_size, int): exception_string = ( - "FileHashStore - _is_file_size_valid: size given must be > 0" + "FileHashStore - _is_int_and_non_negative: size given must be > 0" ) logging.error(exception_string) raise ValueError(exception_string) From 38a1a3cedc67c5f76160569990ea521544f5ecec Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 16 Nov 2023 15:36:52 -0800 Subject: [PATCH 55/71] Update and add new pytests for '_write_cid_refs_file' method --- src/hashstore/filehashstore.py | 41 +++++++++++++++----------- tests/test_filehashstore_references.py | 23 +++++++++++++++ 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a3abc1ac..492e4f34 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1167,8 +1167,9 @@ def delete_tmp_file(): return tmp def _write_cid_refs_file(self, path, pid): - """Write the cid reference file in the supplied path. A reference file contains - every pid that references a cid each on its own line. + """Write the cid reference file in the supplied path to a file. A reference file + contains every pid that references a cid each on its own line. This method will + only write into an empty file, and will not write over an an existing one. Args: path (string): Path of file to be written into @@ -1180,10 +1181,14 @@ def _write_cid_refs_file(self, path, pid): path, ) - # TODO: Check that the given path does not contain any data before writing - # This method only writes a new cid refs file and should not overwrite - # an existing one. - # TODO: Write test to confirm exception is thrown when path contains data + if os.path.isfile(path): + if os.path.getsize(path) != 0: + err_msg = ( + "FileHashStore - _write_cid_refs_file: Failed to write cid reference file." 
+ + f" File is not empty: {path} " + ) + logging.error(err_msg) + raise OSError(err_msg) try: with open(path, "w", encoding="utf8") as cid_ref_file: @@ -1257,7 +1262,7 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): pid (string): Authority-based or persistent identifier of object """ logging.debug( - "FileHashStore - delete_cid_refs_pid: Deleting pid (%s) from cid reference file: %s", + "FileHashStore - _delete_cid_refs_pid: Deleting pid (%s) from cid reference file: %s", pid, cid_ref_abs_path, ) @@ -1270,7 +1275,7 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): if pid not in cid_ref_file_content: err_msg = ( - f"FileHashStore - delete_cid_refs_pid: pid ({pid}) does not exist in" + f"FileHashStore - _delete_cid_refs_pid: pid ({pid}) does not exist in" + f" cid reference file: {cid_ref_abs_path} " ) raise ValueError(err_msg) @@ -1285,7 +1290,7 @@ def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): except Exception as err: exception_string = ( - "FileHashStore - delete_cid_refs_pid: failed to update reference for cid:" + "FileHashStore - _delete_cid_refs_pid: failed to update reference for cid:" + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) @@ -1301,21 +1306,21 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): boolean: True if deleted, False if not """ logging.debug( - "FileHashStore - delete_cid_refs_file: Deleting reference file: %s", + "FileHashStore - _delete_cid_refs_file: Deleting reference file: %s", cid_ref_abs_path, ) try: if not os.path.exists(cid_ref_abs_path): err_msg = ( - "FileHashStore - delete_cid_refs_file: Cid reference file not found: %s", + "FileHashStore - _delete_cid_refs_file: Cid reference file not found: %s", cid_ref_abs_path, ) logging.error(err_msg) raise FileNotFoundError(err_msg) if os.path.getsize(cid_ref_abs_path) != 0: err_msg = ( - "FileHashStore - delete_cid_refs_file: Failed to delete cid reference file." 
+ "FileHashStore - _delete_cid_refs_file: Failed to delete cid reference file." + f" File is not empty: {cid_ref_abs_path} " ) logging.error(err_msg) @@ -1326,7 +1331,7 @@ def _delete_cid_refs_file(self, cid_ref_abs_path): except Exception as err: exception_string = ( - "FileHashStore - delete_cid_refs_file: failed to delete reference file:" + "FileHashStore - _delete_cid_refs_file: failed to delete reference file:" + f" {cid_ref_abs_path}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) @@ -1341,7 +1346,7 @@ def _write_pid_refs_file(self, path, cid): cid (string): Content identifier """ logging.debug( - "FileHashStore - write_pid_refs_file: Writing cid (%s) into file: %s", + "FileHashStore - _write_pid_refs_file: Writing cid (%s) into file: %s", cid, path, ) @@ -1357,7 +1362,7 @@ def _write_pid_refs_file(self, path, cid): except Exception as err: exception_string = ( - f"FileHashStore - write_pid_refs_file: failed to write cid ({cid})" + f"FileHashStore - _write_pid_refs_file: failed to write cid ({cid})" + f" into path: {path}. 
Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) @@ -1370,14 +1375,14 @@ def _delete_pid_refs_file(self, pid_ref_abs_path): pid_ref_abs_path (string): Absolute path to the pid ref file """ logging.debug( - "FileHashStore - delete_pid_refs_file: Deleting reference file: %s", + "FileHashStore - _delete_pid_refs_file: Deleting reference file: %s", pid_ref_abs_path, ) try: if not os.path.exists(pid_ref_abs_path): err_msg = ( - "FileHashStore - delete_pid_refs_file: pid reference file not found: %s", + "FileHashStore - _delete_pid_refs_file: pid reference file not found: %s", pid_ref_abs_path, ) raise FileNotFoundError(err_msg) @@ -1387,7 +1392,7 @@ def _delete_pid_refs_file(self, pid_ref_abs_path): except Exception as err: exception_string = ( - "FileHashStore - delete_pid_refs_file: failed to delete reference file:" + "FileHashStore - _delete_pid_refs_file: failed to delete reference file:" + f" {pid_ref_abs_path}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 6a1afc07..1f4b4b2e 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -120,6 +120,29 @@ def test_write_cid_refs_file_content(pids, store): assert pid == cid_ref_file_pid.strip() +def test_write_cid_refs_file_into_empty_file(pids, store): + """Test that write_cid_reference writes an empty file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + with open(cid_ref_abs_path, "w", encoding="utf8"): + pass + store._write_cid_refs_file(cid_ref_abs_path, pid) + assert os.path.exists(cid_ref_abs_path) + + +def test_write_cid_refs_file_file_not_empty(pids, store): + """Test that write_cid_reference does not overwrite an existing file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = 
store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store._write_cid_refs_file(cid_ref_abs_path, pid) + with pytest.raises(OSError): + store._write_cid_refs_file(cid_ref_abs_path, "other_pid") + + def test_update_cid_refs_content(pids, store): """Test that update_cid_ref updates the ref file as expected.""" for pid in pids.keys(): From a9cd611411f6a25d75209f32e9a31a5374ed9fa6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 17 Nov 2023 11:47:49 -0800 Subject: [PATCH 56/71] Move info logging statements in finally blocks into try block --- src/hashstore/filehashstore.py | 47 +++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 492e4f34..5cce2793 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -453,6 +453,10 @@ def store_object( ) if pid is None: object_metadata = self.store_data_only(data) + logging.info( + "FileHashStore - store_object: Successfully stored object for cid: %s", + object_metadata.id, + ) else: object_metadata = self.store_and_validate_data( pid, @@ -462,6 +466,10 @@ def store_object( checksum_algorithm=checksum_algorithm_checked, file_size_to_validate=expected_object_size, ) + logging.info( + "FileHashStore - store_object: Successfully stored object for pid: %s", + pid, + ) finally: # Release pid with self.object_lock: @@ -470,10 +478,6 @@ def store_object( pid, ) self.object_locked_pids.remove(pid) - logging.info( - "FileHashStore - store_object: Successfully stored object for pid: %s", - pid, - ) return object_metadata @@ -499,10 +503,12 @@ def verify_object( ) self._is_int_and_non_negative(expected_file_size) if object_metadata is None or not isinstance(ObjectMetadata): - raise ValueError( + exception_string = ( "FileHashStore - verify_object: 'object_metadata' cannot be None." + " Must be a 'ObjectMetadata' object." 
) + logging.error(exception_string) + raise ValueError(exception_string) else: object_metadata_hex_digests = object_metadata.hex_digests object_metadata_file_size = object_metadata.obj_size @@ -580,6 +586,12 @@ def tag_object(self, pid, cid): # Ensure that the reference files have been written as expected # If there is an issue, client or user will have to manually review self._validate_references(pid, cid) + + info_msg = ( + f"FileHashStore - tag_object: Successfully tagged cid: {cid}" + + f" with pid: {pid}" + ) + logging.info(info_msg) return True finally: # Release cid @@ -589,8 +601,6 @@ def tag_object(self, pid, cid): cid, ) self.reference_locked_cids.remove(cid) - info_msg = f"FileHashStore - tag_object: Successfully tagged cid: {cid} with pid: {pid}" - logging.info(info_msg) def find_object(self, pid): logging.debug( @@ -643,6 +653,12 @@ def store_metadata(self, pid, metadata, format_id=None): pid, ) metadata_cid = self.put_metadata(metadata, pid, checked_format_id) + + logging.info( + "FileHashStore - store_metadata: Successfully stored metadata for pid: %s", + pid, + ) + return metadata_cid finally: # Release pid with self.metadata_lock: @@ -651,12 +667,6 @@ def store_metadata(self, pid, metadata, format_id=None): pid, ) self.metadata_locked_pids.remove(pid) - logging.info( - "FileHashStore - store_metadata: Successfully stored metadata for pid: %s", - pid, - ) - - return metadata_cid def retrieve_object(self, pid): logging.debug( @@ -745,6 +755,12 @@ def delete_object(self, pid): if cid_refs_deleted: entity = "objects" self.delete(entity, cid) + + info_msg = ( + "FileHashStore - delete_object: Successfully deleted references and/or" + + f" objects associated with pid: {pid}" + ) + logging.info(info_msg) return True finally: # Release cid @@ -754,11 +770,6 @@ def delete_object(self, pid): cid, ) self.reference_locked_cids.remove(cid) - info_msg = ( - "FileHashStore - delete_object: Successfully deleted references and/or" - + f" objects associated with 
pid: {pid}" - ) - logging.info(info_msg) def delete_metadata(self, pid, format_id=None): logging.debug( From 19695f0ace1aa1875674b196d8709372d69d5ccb Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 17 Nov 2023 12:36:42 -0800 Subject: [PATCH 57/71] Fix bug in 'tag_object', add new pytest and revise logging statements --- src/hashstore/filehashstore.py | 23 ++++++++++++++--------- tests/test_filehashstore_references.py | 16 ++++++++++++++++ 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 5cce2793..60810ffe 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -559,7 +559,12 @@ def tag_object(self, pid, cid): raise FileExistsError(exception_string) elif os.path.exists(cid_ref_abs_path): # Update cid ref files if it already exists - self.update_cid_refs(cid_ref_abs_path, pid) + self._update_cid_refs(cid_ref_abs_path, pid) + logging.info( + "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", + cid, + pid, + ) else: # All ref files begin as tmp files and get moved sequentially at once # Ensure refs tmp folder exists @@ -587,11 +592,11 @@ def tag_object(self, pid, cid): # If there is an issue, client or user will have to manually review self._validate_references(pid, cid) - info_msg = ( - f"FileHashStore - tag_object: Successfully tagged cid: {cid}" - + f" with pid: {pid}" + logging.info( + "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", + cid, + pid, ) - logging.info(info_msg) return True finally: # Release cid @@ -756,11 +761,11 @@ def delete_object(self, pid): entity = "objects" self.delete(entity, cid) - info_msg = ( + info_string = ( "FileHashStore - delete_object: Successfully deleted references and/or" + f" objects associated with pid: {pid}" ) - logging.info(info_msg) + logging.info(info_string) return True finally: # Release cid @@ -809,11 +814,11 @@ def get_hex_digest(self, pid, algorithm): cid_stream = 
self.open(entity, object_cid) hex_digest = self.computehash(cid_stream, algorithm=algorithm) - info_msg = ( + info_string = ( f"FileHashStore - get_hex_digest: Successfully calculated hex digest for pid: {pid}." + f" Hex Digest: {hex_digest}", ) - logging.info(info_msg) + logging.info(info_string) return hex_digest # FileHashStore Core Methods diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 1f4b4b2e..3cfda1a9 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -96,6 +96,22 @@ def test_tag_object_cid_refs_file_exists(pids, store): assert not os.path.exists(second_cid_hash) +def test_tag_object_cid_refs_update(pids, store): + """Test tag object updates a cid reference file that already exists.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.id + store.tag_object(pid, cid) + store.tag_object("dou.test.1", cid) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + cid_ref_file_pid = f.read() + + assert "dou.test.1" in cid_ref_file_pid + + def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): From 5c9d22ff7bc1d2529853db83d1439c4a186980de Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 17 Nov 2023 14:04:02 -0800 Subject: [PATCH 58/71] Update HashStore interface docstring for 'store_object' --- src/hashstore/hashstore.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index b1851d0e..606a2496 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -27,9 +27,10 @@ def store_object( """The `store_object` method is responsible for the atomic storage of objects to disk using a given stream. 
Upon successful storage, the method returns a ObjectMetadata object containing relevant file information, such as the file's id (which can be - used to locate the object on disk), the file's size, and a hex digest map of algorithms + used to locate the object on disk), the file's size, and a hex digest dict of algorithms and checksums. `store_object` also ensures that an object is stored only once by - synchronizing multiple calls and rejecting calls to store duplicate objects. + synchronizing multiple calls and rejecting calls to store duplicate objects. Lastly, + it should call `tag_object` to create the references to allow the object to be found. The file's id is determined by calculating the object's content identifier based on the store's default algorithm, which is also used as the permanent address of the file. @@ -38,17 +39,20 @@ def store_object( and is stored in the `/store_directory/objects/` directory. By default, the hex digest map includes the following hash algorithms: - Default algorithms and hex digests to return: md5, sha1, sha256, sha384, sha512, - which are the most commonly used algorithms in dataset submissions to DataONE - and the Arctic Data Center. If an additional algorithm is provided, the - `store_object` method checks if it is supported and adds it to the map along - with its corresponding hex digest. An algorithm is considered "supported" if it - is recognized as a valid hash algorithm in the `hashlib` library. - - Similarly, if a file size and/or checksum & checksumAlgorithm value are provided, + md5, sha1, sha256, sha384, sha512 - which are the most commonly used algorithms in + dataset submissions to DataONE and the Arctic Data Center. If an additional algorithm + is provided, the `store_object` method checks if it is supported and adds it to the + hex digests dict along with its corresponding hex digest. An algorithm is considered + "supported" if it is recognized as a valid hash algorithm in the `hashlib` library. 
+ + Similarly, if a file size and/or checksum & checksum_algorithm value are provided, `store_object` validates the object to ensure it matches the given arguments before moving the file to its permanent address. + Note, calling `store_object` is a possibility, but should only store the object + without calling `tag_object`. It is the caller's responsibility to finalize the + process by calling `tag_object` after veriftying the correct object is stored. + Args: pid (string): Authority-based identifier. data (mixed): String or path to object. From b8d9715034272f7b4f52e328d9b4c586844279bd Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 17 Nov 2023 15:58:19 -0800 Subject: [PATCH 59/71] Initial refactor to 'store_object', fixed bug in 'verify_object' and add new pytests --- src/hashstore/filehashstore.py | 146 ++++++++++++++----------- tests/test_filehashstore.py | 99 +++++++++++++++++ tests/test_filehashstore_interface.py | 9 -- tests/test_filehashstore_references.py | 16 +++ 4 files changed, 197 insertions(+), 73 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 60810ffe..3b7db430 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -418,46 +418,47 @@ def store_object( checksum_algorithm=None, expected_object_size=None, ): - logging.debug( - "FileHashStore - store_object: Request to store object for pid: %s", pid - ) - # Validate input parameters - self._is_string_none_or_empty(pid, "pid", "store_object") - self._validate_data_to_store(data) - self._is_int_and_non_negative(expected_object_size) - ( - additional_algorithm_checked, - checksum_algorithm_checked, - ) = self._validate_algorithms_and_checksum( - additional_algorithm, checksum, checksum_algorithm - ) - - # Wait for the pid to release if it's in use - while pid in self.object_locked_pids: - logging.debug( - "FileHashStore - store_object: %s is currently being stored. 
Waiting.", - pid, + if pid is None and self._validate_data_to_store(data): + logging.debug("FileHashStore - store_object: Request to store data only.") + object_metadata = self.store_data_only(data) + logging.info( + "FileHashStore - store_object: Successfully stored object for cid: %s", + object_metadata.id, ) - time.sleep(self.time_out_sec) - # Modify object_locked_pids consecutively - with self.object_lock: + else: logging.debug( - "FileHashStore - store_object: Adding pid: %s to object_locked_pids.", - pid, + "FileHashStore - store_object: Request to store object for pid: %s", pid ) - self.object_locked_pids.append(pid) - try: - logging.debug( - "FileHashStore - store_object: Attempting to store object for pid: %s", - pid, + # Validate input parameters + self._is_string_none_or_empty(pid, "pid", "store_object") + self._validate_data_to_store(data) + self._is_int_and_non_negative(expected_object_size) + ( + additional_algorithm_checked, + checksum_algorithm_checked, + ) = self._validate_algorithms_and_checksum( + additional_algorithm, checksum, checksum_algorithm ) - if pid is None: - object_metadata = self.store_data_only(data) - logging.info( - "FileHashStore - store_object: Successfully stored object for cid: %s", - object_metadata.id, + + # Wait for the pid to release if it's in use + while pid in self.object_locked_pids: + logging.debug( + "FileHashStore - store_object: %s is currently being stored. 
Waiting.", + pid, + ) + time.sleep(self.time_out_sec) + # Modify object_locked_pids consecutively + with self.object_lock: + logging.debug( + "FileHashStore - store_object: Adding pid: %s to object_locked_pids.", + pid, + ) + self.object_locked_pids.append(pid) + try: + logging.debug( + "FileHashStore - store_object: Attempting to store object for pid: %s", + pid, ) - else: object_metadata = self.store_and_validate_data( pid, data, @@ -466,18 +467,19 @@ def store_object( checksum_algorithm=checksum_algorithm_checked, file_size_to_validate=expected_object_size, ) + # TODO: Tag object afterwards and fix pytests logging.info( "FileHashStore - store_object: Successfully stored object for pid: %s", pid, ) - finally: - # Release pid - with self.object_lock: - logging.debug( - "FileHashStore - store_object: Removing pid: %s from object_locked_pids.", - pid, - ) - self.object_locked_pids.remove(pid) + finally: + # Release pid + with self.object_lock: + logging.debug( + "FileHashStore - store_object: Removing pid: %s from object_locked_pids.", + pid, + ) + self.object_locked_pids.remove(pid) return object_metadata @@ -502,7 +504,7 @@ def verify_object( checksum_algorithm, "checksum_algorithm", "verify_object" ) self._is_int_and_non_negative(expected_file_size) - if object_metadata is None or not isinstance(ObjectMetadata): + if object_metadata is None or not isinstance(object_metadata, ObjectMetadata): exception_string = ( "FileHashStore - verify_object: 'object_metadata' cannot be None." + " Must be a 'ObjectMetadata' object." 
@@ -523,6 +525,10 @@ def verify_object( tmp_file_size=object_metadata_file_size, file_size_to_validate=expected_file_size, ) + logging.info( + "FileHashStore - verify_object: object has been validated for cid: %s", + object_metadata.id, + ) def tag_object(self, pid, cid): logging.debug( @@ -1024,12 +1030,12 @@ def _move_and_get_checksums( pid_checksum = self.get_hex_digest(pid, self.algorithm) if pid_checksum == hex_digests.get(self.algorithm): # If the checksums match, return and log warning - warning_msg = ( + exception_string = ( "FileHashStore - _move_and_get_checksums: File moved" + f" successfully but unexpected issue encountered: {exception_string}", ) - logging.warning(warning_msg) - return + logging.error(exception_string) + raise err else: debug_msg = ( "FileHashStore - _move_and_get_checksums: Permanent file" @@ -1513,6 +1519,9 @@ def _validate_data_to_store(self, data): Args: data (string, path, stream): object to validate + + Returns: + boolean: True if valid. """ if ( not isinstance(data, str) @@ -1532,12 +1541,13 @@ def _validate_data_to_store(self, data): ) logging.error(exception_string) raise TypeError(exception_string) + return True def _validate_algorithms_and_checksum( self, additional_algorithm, checksum, checksum_algorithm ): - """Determines whether calling app has supplied the necessary arguments to validate - an object with a checksum value + """Determines whether caller has supplied the necessary arguments to validate + an object with a checksum value. 
Args: additional_algorithm: value of additional algorithm to calculate @@ -1641,24 +1651,32 @@ def _validate_object( logging.error(exception_string) raise ValueError(exception_string) if checksum_algorithm is not None and checksum is not None: - hex_digest_stored = hex_digests[checksum_algorithm] - if hex_digest_stored != checksum: + if checksum_algorithm not in hex_digests: exception_string = ( - "FileHashStore - _validate_object: Hex digest and checksum" - + f" do not match - file not stored for pid: {pid}. Algorithm:" - + f" {checksum_algorithm}. Checksum provided: {checksum} !=" - + f" HexDigest: {hex_digest_stored}." + f"FileHashStore - _validate_object: checksum_algorithm ({checksum_algorithm})" + + " cannot be found in the hex digests dictionary." ) - if pid is not None: - self.delete(entity, tmp_file_name) - exception_string_for_pid = ( - exception_string + f"Tmp file ({tmp_file_name}) deleted." + logging.error(exception_string) + raise KeyError(exception_string) + else: + hex_digest_stored = hex_digests[checksum_algorithm] + if hex_digest_stored != checksum: + exception_string = ( + "FileHashStore - _validate_object: Hex digest and checksum" + + f" do not match - file not stored for pid: {pid}. Algorithm:" + + f" {checksum_algorithm}. Checksum provided: {checksum} !=" + + f" HexDigest: {hex_digest_stored}." ) - logging.error(exception_string_for_pid) - raise ValueError(exception_string_for_pid) - else: - logging.error(exception_string) - raise ValueError(exception_string) + if pid is not None: + self.delete(entity, tmp_file_name) + exception_string_for_pid = ( + exception_string + f"Tmp file ({tmp_file_name}) deleted." 
+ ) + logging.error(exception_string_for_pid) + raise ValueError(exception_string_for_pid) + else: + logging.error(exception_string) + raise ValueError(exception_string) def _validate_references(self, pid, cid): """Verifies that the supplied pid and pid reference file and content have been diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 59c8b1ac..13ae988b 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -443,6 +443,105 @@ def test_move_and_get_checksums_file_size_raises_error(pids, store): input_stream.close() +def test_validate_object(pids, store): + """Test _validate_object succeeds given good arguments.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + hex_digests = object_metadata.hex_digests + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + # pylint: disable=W0212 + store._validate_object( + None, + checksum, + checksum_algorithm, + None, + hex_digests, + None, + expected_file_size, + expected_file_size, + ) + + +def test_validate_object_incorrect_size(pids, store): + """Test _validate_object throws exception when size is incorrect.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + hex_digests = object_metadata.hex_digests + checksum = hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + with pytest.raises(ValueError): + # pylint: disable=W0212 + store._validate_object( + None, + checksum, + checksum_algorithm, + None, + hex_digests, + None, + 1000, + 2000, + ) + + +def test_validate_object_incorrect_size_with_pid(pids, store): + """Test _validate_object deletes the 
expected tmp file if obj size does not match + and raises an exception.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + hex_digests = object_metadata.hex_digests + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + + objects_tmp_folder = store.objects + "/tmp" + # pylint: disable=W0212 + tmp_file = store._mktmpfile(objects_tmp_folder) + assert os.path.isfile(tmp_file.name) + with pytest.raises(ValueError): + store._validate_object( + "Test_Pid", + checksum, + checksum_algorithm, + None, + hex_digests, + tmp_file.name, + 1000, + expected_file_size, + ) + assert not os.path.isfile(tmp_file.name) + + +def test_validate_object_missing_key_in_hex_digests(pids, store): + """Test _validate_object throws exception when algorithm is not found in hex digests.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = "blake2s" + expected_file_size = object_metadata.obj_size + with pytest.raises(KeyError): + store.verify_object( + object_metadata, checksum, checksum_algorithm, expected_file_size + ) + + def test_write_to_tmp_file_and_get_hex_digests_additional_algo(store): """Test _write...hex_digests returns correct hex digests for additional algorithm.""" test_dir = "tests/testdata/" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 566849da..8e815f92 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -126,15 +126,6 @@ def test_store_object_pid_empty_spaces(store): store.store_object(" ", path) 
-def test_store_object_pid_none(store): - """Test store object raises error when supplied with 'None' pid.""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid - with pytest.raises(ValueError): - store.store_object(None, path) - - def test_store_object_data_incorrect_type_none(store): """Test store object raises error when data is 'None'.""" pid = "jtao.1700.1" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 3cfda1a9..b4871877 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -112,6 +112,22 @@ def test_tag_object_cid_refs_update(pids, store): assert "dou.test.1" in cid_ref_file_pid +def test_verify_object(pids, store): + """Test verify object succeeds given good arguments.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + store.verify_object( + object_metadata, checksum, checksum_algorithm, expected_file_size + ) + + def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): From 03f2b44dec89a1ff388fd2543938f0ca4678436b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 Nov 2023 10:14:23 -0800 Subject: [PATCH 60/71] Clean up code to improve clarity --- src/hashstore/filehashstore.py | 219 +++++++++++++------------ tests/test_filehashstore.py | 6 +- tests/test_filehashstore_references.py | 32 ++-- tests/test_hashstore.py | 31 ++-- 4 files changed, 148 insertions(+), 140 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3b7db430..7f6dd806 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ 
-80,6 +80,7 @@ def __init__(self, properties=None): checked_properties[property_name] for property_name in self.property_required_keys ] + # TODO: Ensure that store algorithm in properties is compatible with HashStore # Check to see if a configuration is present in the given store path self.hashstore_configuration_yaml = prop_store_path + "/hashstore.yaml" @@ -418,7 +419,7 @@ def store_object( checksum_algorithm=None, expected_object_size=None, ): - if pid is None and self._validate_data_to_store(data): + if pid is None and self._validate_arg_data(data): logging.debug("FileHashStore - store_object: Request to store data only.") object_metadata = self.store_data_only(data) logging.info( @@ -431,12 +432,12 @@ def store_object( ) # Validate input parameters self._is_string_none_or_empty(pid, "pid", "store_object") - self._validate_data_to_store(data) + self._validate_arg_data(data) self._is_int_and_non_negative(expected_object_size) ( additional_algorithm_checked, checksum_algorithm_checked, - ) = self._validate_algorithms_and_checksum( + ) = self._validate_arg_algorithms_and_checksum( additional_algorithm, checksum, checksum_algorithm ) @@ -515,7 +516,7 @@ def verify_object( object_metadata_hex_digests = object_metadata.hex_digests object_metadata_file_size = object_metadata.obj_size checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) - self._validate_object( + self._validate_arg_object( pid=None, checksum=checksum, checksum_algorithm=checksum_algorithm_checked, @@ -596,7 +597,7 @@ def tag_object(self, pid, cid): shutil.move(cid_tmp_file_path, cid_ref_abs_path) # Ensure that the reference files have been written as expected # If there is an issue, client or user will have to manually review - self._validate_references(pid, cid) + self._verify_hashstore_references(pid, cid) logging.info( "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", @@ -639,8 +640,8 @@ def store_metadata(self, pid, metadata, format_id=None): ) # Validate 
input parameters self._is_string_none_or_empty(pid, "pid", "store_metadata") - checked_format_id = self._validate_format_id(format_id, "store_metadata") - self._validate_metadata_to_store(metadata) + checked_format_id = self._validate_arg_format_id(format_id, "store_metadata") + self._validate_arg_metadata(metadata) # Wait for the pid to release if it's in use while pid in self.metadata_locked_pids: @@ -713,7 +714,7 @@ def retrieve_metadata(self, pid, format_id=None): pid, ) self._is_string_none_or_empty(pid, "pid", "retrieve_metadata") - checked_format_id = self._validate_format_id(format_id, "retrieve_metadata") + checked_format_id = self._validate_arg_format_id(format_id, "retrieve_metadata") entity = "metadata" metadata_cid = self.computehash(pid + checked_format_id) @@ -788,7 +789,7 @@ def delete_metadata(self, pid, format_id=None): pid, ) self._is_string_none_or_empty(pid, "pid", "delete_metadata") - checked_format_id = self._validate_format_id(format_id, "delete_metadata") + checked_format_id = self._validate_arg_format_id(format_id, "delete_metadata") entity = "metadata" metadata_cid = self.computehash(pid + checked_format_id) @@ -995,7 +996,7 @@ def _move_and_get_checksums( # file and calculate the hex digests because the given checksum could be incorrect. if not os.path.isfile(abs_file_path): # Files are stored once and only once - self._validate_object( + self._validate_arg_object( pid, checksum, checksum_algorithm, @@ -1513,9 +1514,9 @@ def _mktmpmetadata(self, stream): # FileHashStore Utility & Supporting Methods - def _validate_data_to_store(self, data): - """Evaluates a data argument to ensure that it is either a string, path or - stream object before attempting to store it. + def _validate_arg_data(self, data): + """Checks a data argument to ensure that it is either a string, path or stream + object. 
Args: data (string, path, stream): object to validate @@ -1529,7 +1530,7 @@ def _validate_data_to_store(self, data): and not isinstance(data, io.BufferedIOBase) ): exception_string = ( - "FileHashStore - store_object: Data must be a path, string or buffered" + "FileHashStore - _validate_arg_data: Data must be a path, string or buffered" + f" stream type. Data type supplied: {type(data)}" ) logging.error(exception_string) @@ -1537,13 +1538,13 @@ def _validate_data_to_store(self, data): if isinstance(data, str): if data.replace(" ", "") == "": exception_string = ( - "FileHashStore - store_object: Data string cannot be empty." + "FileHashStore - _validate_arg_data: Data string cannot be empty." ) logging.error(exception_string) raise TypeError(exception_string) return True - def _validate_algorithms_and_checksum( + def _validate_arg_algorithms_and_checksum( self, additional_algorithm, checksum, checksum_algorithm ): """Determines whether caller has supplied the necessary arguments to validate @@ -1553,6 +1554,10 @@ def _validate_algorithms_and_checksum( additional_algorithm: value of additional algorithm to calculate checksum (string): value of checksum checksum_algorithm (string): algorithm of checksum + + Returns: + additional_algorithm_checked (string): hashlib compatible string or 'None' + checksum_algorithm_checked (string): hashlib compatible string or 'None' """ additional_algorithm_checked = None if additional_algorithm != self.algorithm and additional_algorithm is not None: @@ -1575,41 +1580,7 @@ def _validate_algorithms_and_checksum( checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) return additional_algorithm_checked, checksum_algorithm_checked - def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): - """Create the final list of hash algorithms to calculate - - Args: - additional_algorithm (string) - checksum_algorithm (string) - - Return: - algorithm_list_to_calculate (set): De-duplicated list of hash algorithms - 
""" - algorithm_list_to_calculate = self.default_algo_list - if checksum_algorithm is not None: - self.clean_algorithm(checksum_algorithm) - if checksum_algorithm in self.other_algo_list: - debug_additional_other_algo_str = ( - f"FileHashStore - _refine_algorithm_list: checksum algo: {checksum_algorithm}" - + " found in other_algo_lists, adding to list of algorithms to calculate." - ) - logging.debug(debug_additional_other_algo_str) - algorithm_list_to_calculate.append(checksum_algorithm) - if additional_algorithm is not None: - self.clean_algorithm(additional_algorithm) - if additional_algorithm in self.other_algo_list: - debug_additional_other_algo_str = ( - f"FileHashStore - _refine_algorithm_list: addit algo: {additional_algorithm}" - + " found in other_algo_lists, adding to list of algorithms to calculate." - ) - logging.debug(debug_additional_other_algo_str) - algorithm_list_to_calculate.append(additional_algorithm) - - # Remove duplicates - algorithm_list_to_calculate = set(algorithm_list_to_calculate) - return algorithm_list_to_calculate - - def _validate_object( + def _validate_arg_object( self, pid, checksum, @@ -1635,7 +1606,7 @@ def _validate_object( if file_size_to_validate is not None and file_size_to_validate > 0: if file_size_to_validate != tmp_file_size: exception_string = ( - "FileHashStore - _validate_object: Object file size calculated: " + "FileHashStore - _validate_arg_object: Object file size calculated: " + f" {tmp_file_size} does not match with expected size:" + f"{file_size_to_validate}." ) @@ -1653,8 +1624,8 @@ def _validate_object( if checksum_algorithm is not None and checksum is not None: if checksum_algorithm not in hex_digests: exception_string = ( - f"FileHashStore - _validate_object: checksum_algorithm ({checksum_algorithm})" - + " cannot be found in the hex digests dictionary." + "FileHashStore - _validate_arg_object: checksum_algorithm" + + f" ({checksum_algorithm}) cannot be found in the hex digests dictionary." 
) logging.error(exception_string) raise KeyError(exception_string) @@ -1662,7 +1633,7 @@ def _validate_object( hex_digest_stored = hex_digests[checksum_algorithm] if hex_digest_stored != checksum: exception_string = ( - "FileHashStore - _validate_object: Hex digest and checksum" + "FileHashStore - _validate_arg_object: Hex digest and checksum" + f" do not match - file not stored for pid: {pid}. Algorithm:" + f" {checksum_algorithm}. Checksum provided: {checksum} !=" + f" HexDigest: {hex_digest_stored}." @@ -1678,7 +1649,57 @@ def _validate_object( logging.error(exception_string) raise ValueError(exception_string) - def _validate_references(self, pid, cid): + def _validate_arg_metadata(self, metadata): + """Evaluates a metadata argument to ensure that it is either a string, path or + stream object before attempting to store it. + + Args: + metadata (string, path, stream): metadata to validate + """ + if isinstance(metadata, str): + if metadata.replace(" ", "") == "": + exception_string = ( + "FileHashStore - store_metadata: Given string path to" + + " metadata cannot be empty." + ) + logging.error(exception_string) + raise TypeError(exception_string) + if ( + not isinstance(metadata, str) + and not isinstance(metadata, Path) + and not isinstance(metadata, io.BufferedIOBase) + ): + exception_string = ( + "FileHashStore - store_metadata: Metadata must be a path or string" + + f" type, data type supplied: {type(metadata)}" + ) + logging.error(exception_string) + raise TypeError(exception_string) + + def _validate_arg_format_id(self, format_id, method): + """Determines the metadata namespace (format_id) to use for storing, + retrieving and deleting metadata. 
+ + Args: + format_id (string): Metadata namespace to review + method (string): Calling method for logging purposes + + Returns: + checked_format_id (string): Valid metadata namespace + """ + checked_format_id = None + if format_id is not None and format_id.replace(" ", "") == "": + exception_string = f"FileHashStore - {method}: Format_id cannot be empty." + logging.error(exception_string) + raise ValueError(exception_string) + elif format_id is None: + # Use default value set by hashstore config + checked_format_id = self.sysmeta_ns + else: + checked_format_id = format_id + return checked_format_id + + def _verify_hashstore_references(self, pid, cid): """Verifies that the supplied pid and pid reference file and content have been written successfully. @@ -1691,14 +1712,14 @@ def _validate_references(self, pid, cid): cid_ref_abs_path = self.get_refs_abs_path("cid", cid) if not os.path.exists(pid_ref_abs_path): exception_string = ( - "FileHashStore - _validate_references: Pid refs file missing: %s", + "FileHashStore - _verify_hashstore_references: Pid refs file missing: %s", pid_ref_abs_path, ) logging.error(exception_string) raise FileNotFoundError(exception_string) if not os.path.exists(cid_ref_abs_path): exception_string = ( - "FileHashStore - _validate_references: Cid refs file missing: %s", + "FileHashStore - _verify_hashstore_references: Cid refs file missing: %s", cid_ref_abs_path, ) logging.error(exception_string) @@ -1708,8 +1729,8 @@ def _validate_references(self, pid, cid): retrieved_cid = self.find_object(pid) if retrieved_cid != cid: exception_string = ( - f"FileHashStore - _validate_references: Pid refs file exists ({pid_ref_abs_path})" - + f" but cid ({cid}) does not match." + "FileHashStore - _verify_hashstore_references: Pid refs file exists" + + f" ({pid_ref_abs_path}) but cid ({cid}) does not match." 
) logging.error(exception_string) raise ValueError(exception_string) @@ -1722,61 +1743,45 @@ def _validate_references(self, pid, cid): pid_found = True if not pid_found: exception_string = ( - f"FileHashStore - _validate_references: Cid refs file exists ({cid_ref_abs_path})" - + f" but pid ({pid}) not found." + "FileHashStore - _verify_hashstore_references: Cid refs file exists" + + f" ({cid_ref_abs_path}) but pid ({pid}) not found." ) logging.error(exception_string) raise ValueError(exception_string) - def _validate_metadata_to_store(self, metadata): - """Evaluates a metadata argument to ensure that it is either a string, path or - stream object before attempting to store it. + def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): + """Create the final list of hash algorithms to calculate. Args: - metadata (string, path, stream): metadata to validate + additional_algorithm (string) + checksum_algorithm (string) + + Return: + algorithm_list_to_calculate (set): De-duplicated list of hash algorithms """ - if isinstance(metadata, str): - if metadata.replace(" ", "") == "": - exception_string = ( - "FileHashStore - store_metadata: Given string path to" - + " metadata cannot be empty." + algorithm_list_to_calculate = self.default_algo_list + if checksum_algorithm is not None: + self.clean_algorithm(checksum_algorithm) + if checksum_algorithm in self.other_algo_list: + debug_additional_other_algo_str = ( + f"FileHashStore - _refine_algorithm_list: checksum algo: {checksum_algorithm}" + + " found in other_algo_lists, adding to list of algorithms to calculate." 
) - logging.error(exception_string) - raise TypeError(exception_string) - if ( - not isinstance(metadata, str) - and not isinstance(metadata, Path) - and not isinstance(metadata, io.BufferedIOBase) - ): - exception_string = ( - "FileHashStore - store_metadata: Metadata must be a path or string" - + f" type, data type supplied: {type(metadata)}" - ) - logging.error(exception_string) - raise TypeError(exception_string) - - def _validate_format_id(self, format_id, method): - """Determines the metadata namespace (format_id) to use for storing, - retrieving and deleting metadata. - - Args: - format_id (string): Metadata namespace to review - method (string): Calling method for logging purposes + logging.debug(debug_additional_other_algo_str) + algorithm_list_to_calculate.append(checksum_algorithm) + if additional_algorithm is not None: + self.clean_algorithm(additional_algorithm) + if additional_algorithm in self.other_algo_list: + debug_additional_other_algo_str = ( + f"FileHashStore - _refine_algorithm_list: addit algo: {additional_algorithm}" + + " found in other_algo_lists, adding to list of algorithms to calculate." + ) + logging.debug(debug_additional_other_algo_str) + algorithm_list_to_calculate.append(additional_algorithm) - Returns: - checked_format_id (string): Valid metadata namespace - """ - checked_format_id = None - if format_id is not None and format_id.replace(" ", "") == "": - exception_string = f"FileHashStore - {method}: Format_id cannot be empty." 
- logging.error(exception_string) - raise ValueError(exception_string) - elif format_id is None: - # Use default value set by hashstore config - checked_format_id = self.sysmeta_ns - else: - checked_format_id = format_id - return checked_format_id + # Remove duplicates + algorithm_list_to_calculate = set(algorithm_list_to_calculate) + return algorithm_list_to_calculate def clean_algorithm(self, algorithm_string): """Format a string and ensure that it is supported and compatible with @@ -1810,7 +1815,7 @@ def clean_algorithm(self, algorithm_string): return cleaned_string def computehash(self, stream, algorithm=None): - """Compute the hash of a file-like object (or string) using :attr:`algorithm` by + """Compute the hash of a file-like object (or string) using the store algorthm by default or with optional algorithm supported. Args: diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 13ae988b..455b39fa 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -456,7 +456,7 @@ def test_validate_object(pids, store): checksum_algorithm = store.algorithm expected_file_size = object_metadata.obj_size # pylint: disable=W0212 - store._validate_object( + store._validate_arg_object( None, checksum, checksum_algorithm, @@ -481,7 +481,7 @@ def test_validate_object_incorrect_size(pids, store): checksum_algorithm = store.algorithm with pytest.raises(ValueError): # pylint: disable=W0212 - store._validate_object( + store._validate_arg_object( None, checksum, checksum_algorithm, @@ -512,7 +512,7 @@ def test_validate_object_incorrect_size_with_pid(pids, store): tmp_file = store._mktmpfile(objects_tmp_folder) assert os.path.isfile(tmp_file.name) with pytest.raises(ValueError): - store._validate_object( + store._validate_arg_object( "Test_Pid", checksum, checksum_algorithm, diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index b4871877..b009f62e 100644 --- a/tests/test_filehashstore_references.py 
+++ b/tests/test_filehashstore_references.py @@ -346,38 +346,38 @@ def test_delete_pid_refs_file_file_not_found(pids, store): store._delete_cid_refs_file(pid_ref_abs_path) -def test_validate_references_pid_refs_file_missing(pids, store): - """Test that validate_references throws exception when pid refs file is missing.""" +def test_verify_hashstore_references_pid_refs_file_missing(pids, store): + """Test _verify_hashstore_references throws exception when pid refs file is missing.""" for pid in pids.keys(): cid = pids[pid]["sha256"] with pytest.raises(FileNotFoundError): - store._validate_references(pid, cid) + store._verify_hashstore_references(pid, cid) -def test_validate_references_pid_refs_incorrect_cid(pids, store): - """Test that validate_references throws exception when pid refs file cid is incorrect.""" +def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): + """Test _verify_hashstore_references throws exception when pid refs file cid is incorrect.""" for pid in pids.keys(): cid = pids[pid]["sha256"] pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store._write_pid_refs_file(pid_ref_abs_path, "bad_cid") with pytest.raises(FileNotFoundError): - store._validate_references(pid, cid) + store._verify_hashstore_references(pid, cid) -def test_validate_references_cid_refs_file_missing(pids, store): - """Test that validate_references throws exception when cid refs file is missing.""" +def test_verify_hashstore_references_cid_refs_file_missing(pids, store): + """Test _verify_hashstore_references throws exception when cid refs file is missing.""" for pid in pids.keys(): cid = pids[pid]["sha256"] pid_ref_abs_path = store.get_refs_abs_path("pid", pid) store.create_path(os.path.dirname(pid_ref_abs_path)) store._write_pid_refs_file(pid_ref_abs_path, cid) with pytest.raises(FileNotFoundError): - store._validate_references(pid, cid) + store._verify_hashstore_references(pid, cid) -def 
test_validate_references_cid_refs_file_missing_pid(pids, store): - """Test that validate_references throws exception when cid refs file does not contain +def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): + """Test _verify_hashstore_references throws exception when cid refs file does not contain the expected pid.""" for pid in pids.keys(): cid = pids[pid]["sha256"] @@ -388,11 +388,13 @@ def test_validate_references_cid_refs_file_missing_pid(pids, store): store._write_pid_refs_file(pid_ref_abs_path, cid) store._write_cid_refs_file(cid_ref_abs_path, "bad_pid") with pytest.raises(ValueError): - store._validate_references(pid, cid) + store._verify_hashstore_references(pid, cid) -def test_validate_references_cid_refs_file_with_multiple_refs_missing_pid(pids, store): - """Test that validate_references throws exception when cid refs file with multiple +def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pid( + pids, store +): + """Test _verify_hashstore_references throws exception when cid refs file with multiple references does not contain the expected pid.""" for pid in pids.keys(): cid = pids[pid]["sha256"] @@ -410,4 +412,4 @@ def test_validate_references_cid_refs_file_with_multiple_refs_missing_pid(pids, cid_reference_list.append(f"dou.test.{i}") with pytest.raises(ValueError): - store._validate_references(pid, cid) + store._verify_hashstore_references(pid, cid) diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index 953e0fac..b0b57ca8 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -43,7 +43,8 @@ def test_factory_get_hashstore_unsupported_module(factory): def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): - """Check factory raises exception with store algorithm value that part of the default list.""" + """Check factory raises exception with store algorithm value that is not part of + the default list.""" module_name = "hashstore.filehashstore" class_name = 
"FileHashStore" @@ -58,20 +59,20 @@ def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): factory.get_hashstore(module_name, class_name, properties) -def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory): - """Check factory raises exception with incorrectly formatted algorithm value.""" - module_name = "hashstore.filehashstore" - class_name = "FileHashStore" - - properties = { - "store_path": os.getcwd() + "/metacat/test", - "store_depth": 3, - "store_width": 2, - "store_algorithm": "sha256", - "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", - } - with pytest.raises(ValueError): - factory.get_hashstore(module_name, class_name, properties) +# def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory): +# """Check factory raises exception with incorrectly formatted algorithm value.""" +# module_name = "hashstore.filehashstore" +# class_name = "FileHashStore" + +# properties = { +# "store_path": os.getcwd() + "/metacat/test", +# "store_depth": 3, +# "store_width": 2, +# "store_algorithm": "dou_algo", +# "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", +# } +# with pytest.raises(ValueError): +# factory.get_hashstore(module_name, class_name, properties) def test_objectmetadata(): From 8536f1ea9fafe2ca3c1c9b8d8270cf231bad939d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 Nov 2023 13:16:57 -0800 Subject: [PATCH 61/71] Clean up code, review tests and fix minor bugs and revise docstrings and comments --- src/hashstore/filehashstore.py | 18 +- tests/conftest.py | 3 - tests/test_filehashstore.py | 280 ++++++++++++++++------------- tests/test_filehashstore_stream.py | 2 + tests/test_hashstore.py | 28 +-- tests/test_hashstore_client.py | 39 +++- 6 files changed, 216 insertions(+), 154 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7f6dd806..7dce4653 100644 --- a/src/hashstore/filehashstore.py +++ 
b/src/hashstore/filehashstore.py @@ -80,7 +80,6 @@ def __init__(self, properties=None): checked_properties[property_name] for property_name in self.property_required_keys ] - # TODO: Ensure that store algorithm in properties is compatible with HashStore # Check to see if a configuration is present in the given store path self.hashstore_configuration_yaml = prop_store_path + "/hashstore.yaml" @@ -89,8 +88,6 @@ def __init__(self, properties=None): # If no exceptions thrown, FileHashStore ready for initialization logging.debug("FileHashStore - Initializing, properties verified.") self.root = prop_store_path - if not os.path.exists(self.root): - self.create_path(self.root) self.depth = prop_store_depth self.width = prop_store_width self.sysmeta_ns = prop_store_metadata_namespace @@ -154,7 +151,7 @@ def load_properties(self): # Get hashstore properties hashstore_yaml_dict = {} for key in self.property_required_keys: - if key is not "store_path": + if key != "store_path": hashstore_yaml_dict[key] = yaml_data[key] logging.debug( "FileHashStore - load_properties: Successfully retrieved 'hashstore.yaml' properties." @@ -211,6 +208,10 @@ def write_properties(self, properties): logging.error(exception_string) raise ValueError(exception_string) + # If given store path doesn't exist yet, create it. 
+ if not os.path.exists(self.root): + self.create_path(self.root) + # .yaml file to write hashstore_configuration_yaml = self._build_hashstore_yaml_string( store_depth, @@ -307,7 +308,7 @@ def _verify_hashstore_properties(self, properties, prop_store_path): hashstore_yaml_dict = self.load_properties() for key in self.property_required_keys: # 'store_path' is required to init HashStore but not saved in `hashstore.yaml` - if key is not "store_path": + if key != "store_path": supplied_key = properties[key] if key == "store_depth" or key == "store_width": supplied_key = int(properties[key]) @@ -1071,8 +1072,9 @@ def _write_to_tmp_file_and_get_hex_digests( self, stream, additional_algorithm=None, checksum_algorithm=None ): """Create a named temporary file from a `Stream` object and return its filename - and a dictionary of its algorithms and hex digests. If an additionak and/or checksum - algorithm is provided, it will add the respective hex digest to the dictionary. + and a dictionary of its algorithms and hex digests. If an additional and/or checksum + algorithm is provided, it will add the respective hex digest to the dictionary if + it is supported. Args: stream (io.BufferedReader): Object stream. 
@@ -2058,7 +2060,7 @@ def get_refs_abs_path(self, ref_type, hash_id): ref_file_abs_path (string): Path to the ref file for the given type and pid """ entity = "refs" - if ref_type is "pid": + if ref_type == "pid": hash_id = self.computehash(hash_id, self.algorithm) ref_file_abs_path = self.build_abs_path(entity, hash_id).replace( "/refs/", f"/refs/{ref_type}/" diff --git a/tests/conftest.py b/tests/conftest.py index 9b25c520..54af3542 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -47,7 +47,6 @@ def init_pids(): test_pids = { "doi:10.18739/A2901ZH2M": { "file_size_bytes": 39993, - "object_cid": "0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e", "metadata_cid": "323e0799524cec4c7e14d31289cefd884b563b5c052f154a066de5ec1e477da7", "md5": "db91c910a3202478c8def1071c54aae5", "sha1": "1fe86e3c8043afa4c70857ca983d740ad8501ccd", @@ -58,7 +57,6 @@ def init_pids(): }, "jtao.1700.1": { "file_size_bytes": 8724, - "object_cid": "a8241925740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf", "metadata_cid": "ddf07952ef28efc099d10d8b682480f7d2da60015f5d8873b6e1ea75b4baf689", "md5": "f4ea2d07db950873462a064937197b0f", "sha1": "3d25436c4490b08a2646e283dada5c60e5c0539d", @@ -69,7 +67,6 @@ def init_pids(): }, "urn:uuid:1b35d0a5-b17a-423b-a2ed-de2b18dc367a": { "file_size_bytes": 18699, - "object_cid": "7f5cc18f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6", "metadata_cid": "9a2e08c666b728e6cbd04d247b9e556df3de5b2ca49f7c5a24868eb27cddbff2", "md5": "e1932fc75ca94de8b64f1d73dc898079", "sha1": "c6d2a69a3f5adaf478ba796c114f57b990cf7ad1", diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 455b39fa..d6ee134f 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1,4 +1,4 @@ -"""Test module for FileHashStore core, utility and supporting methods.""" +"""Test module for FileHashStore init, core, utility and supporting methods.""" import io import os from pathlib import Path @@ -6,6 +6,9 @@ from 
hashstore.filehashstore import FileHashStore +# Tests for HashStore Configuration and Related Methods + + def test_pids_length(pids): """Ensure test harness pids are present.""" assert len(pids) == 3 @@ -19,14 +22,16 @@ def test_init_directories_created(store): assert os.path.exists(store.metadata) assert os.path.exists(store.metadata + "/tmp") assert os.path.exists(store.refs) + assert os.path.exists(store.refs + "/tmp") assert os.path.exists(store.refs + "/pid") assert os.path.exists(store.refs + "/cid") def test_init_existing_store_incorrect_algorithm_format(store): - """Confirm that exception is thrown when store_algorithm is not a DataONE controlled value""" + """Confirm that exception is thrown when store_algorithm is not a DataONE + controlled value.""" properties = { - "store_path": store.root, + "store_path": store.root + "/incorrect_algo_format", "store_depth": 3, "store_width": 2, "store_algorithm": "sha256", @@ -37,7 +42,7 @@ def test_init_existing_store_incorrect_algorithm_format(store): def test_init_existing_store_correct_algorithm_format(store): - """Confirm second instance of HashStore with DataONE controlled value""" + """Confirm second instance of HashStore with DataONE controlled value.""" properties = { "store_path": store.root, "store_depth": 3, @@ -55,7 +60,8 @@ def test_init_write_properties_hashstore_yaml_exists(store): def test_init_with_existing_hashstore_mismatched_config_depth(store): - """Test init with existing HashStore raises ValueError with mismatching properties.""" + """Test init with existing HashStore raises a ValueError when supplied with + mismatching depth.""" properties = { "store_path": store.root, "store_depth": 1, @@ -68,7 +74,8 @@ def test_init_with_existing_hashstore_mismatched_config_depth(store): def test_init_with_existing_hashstore_mismatched_config_width(store): - """Test init with existing HashStore raises ValueError with mismatching properties.""" + """Test init with existing HashStore raises a ValueError when 
supplied with + mismatching width.""" properties = { "store_path": store.root, "store_depth": 3, @@ -81,7 +88,8 @@ def test_init_with_existing_hashstore_mismatched_config_width(store): def test_init_with_existing_hashstore_mismatched_config_algo(store): - """Test init with existing HashStore raises ValueError with mismatching properties.""" + """Test init with existing HashStore raises a ValueError when supplied with + mismatching default algorithm.""" properties = { "store_path": store.root, "store_depth": 3, @@ -94,7 +102,8 @@ def test_init_with_existing_hashstore_mismatched_config_algo(store): def test_init_with_existing_hashstore_mismatched_config_metadata_ns(store): - """Test init with existing HashStore raises ValueError with mismatching properties.""" + """Test init with existing HashStore raises a ValueError when supplied with + mismatching default name space.""" properties = { "store_path": store.root, "store_depth": 3, @@ -185,7 +194,7 @@ def test_validate_properties_key_value_is_none(store): def test_validate_properties_incorrect_type(store): - """Confirm exception raised when key missing in properties.""" + """Confirm exception raised when a bad properties value is given.""" properties = "etc/filehashstore/hashstore.yaml" with pytest.raises(ValueError): # pylint: disable=W0212 @@ -205,8 +214,11 @@ def test_set_default_algorithms_missing_yaml(store, pids): store._set_default_algorithms() +# Tests for FileHashStore Core Methods + + def test_store_and_validate_data_files_path(pids, store): - """Test store_and_validate_data objects with path object.""" + """Test store_and_validate_data objects with path object for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -217,7 +229,7 @@ def test_store_and_validate_data_files_path(pids, store): def test_store_and_validate_data_files_string(pids, store): - """Test store_and_validate_data objects with string.""" + """Test store_and_validate_data objects with string for the 
path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -228,7 +240,7 @@ def test_store_and_validate_data_files_string(pids, store): def test_store_and_validate_data_files_stream(pids, store): - """Test store_and_validate_data objects with stream.""" + """Test store_and_validate_data objects with stream for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -290,7 +302,7 @@ def test_store_and_validate_data_additional_algorithm(pids, store): def test_store_and_validate_data_with_correct_checksums(pids, store): - """Check store_and_validate_data success with valid checksum and checksum algorithm supplied.""" + """Check store_and_validate_data with valid checksum and checksum algorithm supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" @@ -303,7 +315,7 @@ def test_store_and_validate_data_with_correct_checksums(pids, store): def test_store_and_validate_data_with_incorrect_checksum(pids, store): - """Check store_and_validate_data fails when bad checksum supplied.""" + """Check store_and_validate_data fails when a bad checksum supplied.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -423,8 +435,8 @@ def test_move_and_get_checksums_duplicates_raises_error(pids, store): assert store.count(entity) == 3 -def test_move_and_get_checksums_file_size_raises_error(pids, store): - """Test move and get checksum raises error with incorrect file size""" +def test_move_and_get_checksums_incorrect_file_size(pids, store): + """Test move and get checksum raises error with an incorrect file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): with pytest.raises(ValueError): @@ -443,107 +455,8 @@ def test_move_and_get_checksums_file_size_raises_error(pids, store): input_stream.close() -def test_validate_object(pids, store): - """Test _validate_object succeeds given good arguments.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = 
test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) - hex_digests = object_metadata.hex_digests - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - expected_file_size = object_metadata.obj_size - # pylint: disable=W0212 - store._validate_arg_object( - None, - checksum, - checksum_algorithm, - None, - hex_digests, - None, - expected_file_size, - expected_file_size, - ) - - -def test_validate_object_incorrect_size(pids, store): - """Test _validate_object throws exception when size is incorrect.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) - hex_digests = object_metadata.hex_digests - checksum = hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - with pytest.raises(ValueError): - # pylint: disable=W0212 - store._validate_arg_object( - None, - checksum, - checksum_algorithm, - None, - hex_digests, - None, - 1000, - 2000, - ) - - -def test_validate_object_incorrect_size_with_pid(pids, store): - """Test _validate_object deletes the expected tmp file if obj size does not match - and raises an exception.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) - hex_digests = object_metadata.hex_digests - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - expected_file_size = object_metadata.obj_size - - objects_tmp_folder = store.objects + "/tmp" - # pylint: disable=W0212 - tmp_file = store._mktmpfile(objects_tmp_folder) - assert os.path.isfile(tmp_file.name) - with pytest.raises(ValueError): - store._validate_arg_object( - "Test_Pid", - checksum, - 
checksum_algorithm, - None, - hex_digests, - tmp_file.name, - 1000, - expected_file_size, - ) - assert not os.path.isfile(tmp_file.name) - - -def test_validate_object_missing_key_in_hex_digests(pids, store): - """Test _validate_object throws exception when algorithm is not found in hex digests.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = "blake2s" - expected_file_size = object_metadata.obj_size - with pytest.raises(KeyError): - store.verify_object( - object_metadata, checksum, checksum_algorithm, expected_file_size - ) - - def test_write_to_tmp_file_and_get_hex_digests_additional_algo(store): - """Test _write...hex_digests returns correct hex digests for additional algorithm.""" + """Test _write...hex_digests returns correct hex digests with an additional algorithm.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -561,7 +474,8 @@ def test_write_to_tmp_file_and_get_hex_digests_additional_algo(store): def test_write_to_tmp_file_and_get_hex_digests_checksum_algo(store): - """Test _write...hex_digests returns correct hex digests for checksum algorithm.""" + """Test _write...hex_digests returns correct hex digests when given a checksum_algorithm + is provided.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -579,7 +493,8 @@ def test_write_to_tmp_file_and_get_hex_digests_checksum_algo(store): def test_write_to_tmp_file_and_get_hex_digests_checksum_and_additional_algo(store): - """Test _write...hex_digests returns correct hex digests for checksum algorithm.""" + """Test _write...hex_digests returns correct hex digests when an additional and + checksum algorithm is provided.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -653,7 +568,7 @@ def 
test_write_to_tmp_file_and_get_hex_digests_hex_digests(pids, store): def test_write_to_tmp_file_and_get_hex_digests_tmpfile_object(pids, store): - """Test _write...hex_digests creates file successfully.""" + """Test _write...hex_digests returns a tmp file successfully.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -665,7 +580,7 @@ def test_write_to_tmp_file_and_get_hex_digests_tmpfile_object(pids, store): def test_write_to_tmp_file_and_get_hex_digests_with_unsupported_algorithm(pids, store): - """Test _write...hex_digests raises error when bad algorithm supplied.""" + """Test _write...hex_digests raises an exception when an unsupported algorithm supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -694,27 +609,27 @@ def test_mktmpfile(store): def test_put_metadata_with_path(pids, store): - """Test put_metadata with path object.""" + """Test put_metadata with path object for the path arg.""" entity = "metadata" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, syspath, format_id) + metadata_cid = store.put_metadata(syspath, pid, format_id) assert store.exists(entity, metadata_cid) assert store.count(entity) == 3 def test_put_metadata_with_string(pids, store): - """Test_put metadata with string.""" + """Test_put metadata with string for the path arg.""" entity = "metadata" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = str(Path(test_dir) / filename) - metadata_cid = store.store_metadata(pid, syspath, format_id) + metadata_cid = store.put_metadata(syspath, pid, format_id) assert store.exists(entity, metadata_cid) assert store.count(entity) == 3 @@ -726,14 +641,13 @@ def 
test_put_metadata_cid(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, syspath, format_id) + metadata_cid = store.put_metadata(syspath, pid, format_id) assert metadata_cid == pids[pid]["metadata_cid"] def test_mktmpmetadata(pids, store): """Test mktmpmetadata creates tmpFile.""" test_dir = "tests/testdata/" - entity = "metadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename @@ -741,7 +655,117 @@ def test_mktmpmetadata(pids, store): # pylint: disable=W0212 tmp_name = store._mktmpmetadata(sys_stream) sys_stream.close() - assert store.exists(entity, tmp_name) + assert os.path.exists(tmp_name) + + +# Tests for FileHashStore Utility & Supporting Methods + + +def test_validate_arg_object(pids, store): + """Test _validate_arg_object succeeds given good arguments.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + hex_digests = object_metadata.hex_digests + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + # pylint: disable=W0212 + store._validate_arg_object( + None, + checksum, + checksum_algorithm, + None, + hex_digests, + None, + expected_file_size, + expected_file_size, + ) + + +def test_validate_arg_object_incorrect_size(pids, store): + """Test _validate_arg_object throws exception when size is incorrect.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + hex_digests = object_metadata.hex_digests + checksum = hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + with 
pytest.raises(ValueError): + # pylint: disable=W0212 + store._validate_arg_object( + None, + checksum, + checksum_algorithm, + None, + hex_digests, + None, + 1000, + 2000, + ) + + +def test_validate_arg_object_incorrect_size_with_pid(pids, store): + """Test _validate_arg_object deletes the expected tmp file if obj size does + not match and raises an exception.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + hex_digests = object_metadata.hex_digests + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + + objects_tmp_folder = store.objects + "/tmp" + # pylint: disable=W0212 + tmp_file = store._mktmpfile(objects_tmp_folder) + assert os.path.isfile(tmp_file.name) + with pytest.raises(ValueError): + store._validate_arg_object( + "Test_Pid", + checksum, + checksum_algorithm, + None, + hex_digests, + tmp_file.name, + 1000, + expected_file_size, + ) + assert not os.path.isfile(tmp_file.name) + + +def test_validate_arg_object_missing_key_in_hex_digests(pids, store): + """Test _validate_arg_object throws exception when algorithm is not found in hex digests.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = "blake2s" + expected_file_size = object_metadata.obj_size + with pytest.raises(KeyError): + # pylint: disable=W0212 + store._validate_arg_object( + None, + checksum, + checksum_algorithm, + None, + object_metadata.hex_digests, + None, + expected_file_size, + expected_file_size, + ) def test_clean_algorithm(store): @@ -849,7 +873,7 @@ def test_open_objects(pids, store): def 
test_delete_by_object_metadata_id(pids, store): - """Check objects are deleted after calling delete with hash address id.""" + """Check objects are deleted after calling delete with object id.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): diff --git a/tests/test_filehashstore_stream.py b/tests/test_filehashstore_stream.py index 94e6c412..29fa4d20 100644 --- a/tests/test_filehashstore_stream.py +++ b/tests/test_filehashstore_stream.py @@ -15,6 +15,7 @@ def test_stream_reads_file(pids): hashobj = hashlib.new("sha256") for data in obj_stream: hashobj.update(data) + obj_stream.close() hex_digest = hashobj.hexdigest() assert pids[pid]["sha256"] == hex_digest @@ -28,6 +29,7 @@ def test_stream_reads_path_object(pids): hashobj = hashlib.new("sha256") for data in obj_stream: hashobj.update(data) + obj_stream.close() hex_digest = hashobj.hexdigest() assert pids[pid]["sha256"] == hex_digest diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index b0b57ca8..e161c967 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -59,20 +59,20 @@ def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): factory.get_hashstore(module_name, class_name, properties) -# def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory): -# """Check factory raises exception with incorrectly formatted algorithm value.""" -# module_name = "hashstore.filehashstore" -# class_name = "FileHashStore" - -# properties = { -# "store_path": os.getcwd() + "/metacat/test", -# "store_depth": 3, -# "store_width": 2, -# "store_algorithm": "dou_algo", -# "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", -# } -# with pytest.raises(ValueError): -# factory.get_hashstore(module_name, class_name, properties) +def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory): + """Check factory raises exception with incorrectly formatted algorithm value.""" + module_name = 
"hashstore.filehashstore" + class_name = "FileHashStore" + + properties = { + "store_path": os.getcwd() + "/metacat/test", + "store_depth": 3, + "store_width": 2, + "store_algorithm": "dou_algo", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + } + with pytest.raises(ValueError): + factory.get_hashstore(module_name, class_name, properties) def test_objectmetadata(): diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index d7ec6324..ede33f3a 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -41,6 +41,43 @@ def test_create_hashstore(tmp_path): assert os.path.exists(hashstore_client_python_log) +def test_get_checksum(capsys, store, pids): + """Test calculating a hash via HashStore through client.""" + client_directory = os.getcwd() + "/src/hashstore" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + store.tag_object(pid, object_metadata.id) + + client_module_path = f"{client_directory}/client.py" + test_store = store.root + get_checksum_opt = "-getchecksum" + client_pid_arg = f"-pid={pid}" + algo_arg = f"-algo={store.algorithm}" + chs_args = [ + client_module_path, + test_store, + get_checksum_opt, + client_pid_arg, + algo_arg, + ] + + # Add file path of HashStore to sys so modules can be discovered + sys.path.append(client_directory) + # Manually change sys args to simulate command line arguments + sys.argv = chs_args + client.main() + + capsystext = capsys.readouterr().out + expected_output = ( + f"guid/pid: {pid}\n" + + f"algorithm: {store.algorithm}\n" + + f"Checksum/Hex Digest: {pids[pid][store.algorithm]}\n" + ) + assert capsystext == expected_output + + def test_store_object(store, pids): """Test storing objects to HashStore through client.""" client_directory = os.getcwd() + "/src/hashstore" @@ -209,7 +246,7 @@ def test_delete_objects(pids, store): sys.argv = chs_args 
client.main() - assert not store.exists("objects", pids[pid]["object_cid"]) + assert not store.exists("objects", pids[pid][store.algorithm]) def test_delete_metadata(pids, store): From f993fb98ee4264028a38d7dc386acc11d25083d4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 Nov 2023 13:27:24 -0800 Subject: [PATCH 62/71] Add pytests for 'verify_object' --- src/hashstore/filehashstore.py | 9 ++-- tests/test_filehashstore_references.py | 62 ++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 5 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7dce4653..08b38713 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -496,11 +496,6 @@ def verify_object( checksum_algorithm (string): Algorithm of checksum expected_file_size (int): Size of the tmp file """ - # TODO: Write tests - logging.debug( - "FileHashStore - verify_object: Called to verify object with id: %s", - object_metadata.id, - ) self._is_string_none_or_empty(checksum, "checksum", "verify_object") self._is_string_none_or_empty( checksum_algorithm, "checksum_algorithm", "verify_object" @@ -514,6 +509,10 @@ def verify_object( logging.error(exception_string) raise ValueError(exception_string) else: + logging.info( + "FileHashStore - verify_object: Called to verify object with id: %s", + object_metadata.id, + ) object_metadata_hex_digests = object_metadata.hex_digests object_metadata_file_size = object_metadata.obj_size checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index b009f62e..507ee509 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -128,6 +128,68 @@ def test_verify_object(pids, store): ) +def test_verify_object_exception_incorrect_object_metadata_type(pids, store): + """Test verify object raises exception when incorrect object is given to + object_metadata 
arg.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + with pytest.raises(ValueError): + store.verify_object( + "bad_type", checksum, checksum_algorithm, expected_file_size + ) + + +def test_verify_object_exception_incorrect_size(pids, store): + """Test verify object raises exception when incorrect size is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + with pytest.raises(ValueError): + store.verify_object(object_metadata, checksum, checksum_algorithm, 1000) + + +def test_verify_object_exception_incorrect_checksum(pids, store): + """Test verify object raises exception when incorrect checksum is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + with pytest.raises(ValueError): + store.verify_object( + object_metadata, "abc123", checksum_algorithm, expected_file_size + ) + + +def test_verify_object_exception_incorrect_checksum_algo(pids, store): + """Test verify object raises exception when incorrect algorithm is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum = 
object_metadata.hex_digests.get(store.algorithm) + expected_file_size = object_metadata.obj_size + with pytest.raises(ValueError): + store.verify_object(object_metadata, checksum, "md2", expected_file_size) + + def test_write_cid_refs_file(pids, store): """Test that write_cid_reference writes a reference file.""" for pid in pids.keys(): From 687df49b9f1ad9b923be0ecef14c3f061d5a2103 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 Nov 2023 13:50:17 -0800 Subject: [PATCH 63/71] Refactor 'store_object' to also tag object when a pid is supplied and revise all pytests --- src/hashstore/filehashstore.py | 4 +++- src/hashstore/hashstore.py | 15 ++++++++------- tests/test_filehashstore.py | 8 -------- tests/test_filehashstore_interface.py | 15 ++++----------- tests/test_filehashstore_references.py | 16 ++++++++-------- tests/test_hashstore_client.py | 9 +++------ 6 files changed, 26 insertions(+), 41 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 08b38713..adce204e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -421,6 +421,7 @@ def store_object( expected_object_size=None, ): if pid is None and self._validate_arg_data(data): + # If no pid is supplied, store the object only without tagging logging.debug("FileHashStore - store_object: Request to store data only.") object_metadata = self.store_data_only(data) logging.info( @@ -428,6 +429,7 @@ def store_object( object_metadata.id, ) else: + # Else the object will be stored and tagged logging.debug( "FileHashStore - store_object: Request to store object for pid: %s", pid ) @@ -469,7 +471,7 @@ def store_object( checksum_algorithm=checksum_algorithm_checked, file_size_to_validate=expected_object_size, ) - # TODO: Tag object afterwards and fix pytests + self.tag_object(pid, object_metadata.id) logging.info( "FileHashStore - store_object: Successfully stored object for pid: %s", pid, diff --git a/src/hashstore/hashstore.py 
b/src/hashstore/hashstore.py index 606a2496..1fc27ebb 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -28,9 +28,14 @@ def store_object( disk using a given stream. Upon successful storage, the method returns a ObjectMetadata object containing relevant file information, such as the file's id (which can be used to locate the object on disk), the file's size, and a hex digest dict of algorithms - and checksums. `store_object` also ensures that an object is stored only once by - synchronizing multiple calls and rejecting calls to store duplicate objects. Lastly, - it should call `tag_object` to create the references to allow the object to be found. + and checksums. Storing an object with `store_object` also tags an object (creating + references) which allow the object to be discoverable. + + `store_object` also ensures that an object is stored only once by synchronizing multiple + calls and rejecting calls to store duplicate objects. Note, calling `store_object` without + a pid is a possibility, but should only store the object without tagging the object. + It is then the caller's responsibility to finalize the process by calling `tag_object` + after verifying the correct object is stored. The file's id is determined by calculating the object's content identifier based on the store's default algorithm, which is also used as the permanent address of the file. @@ -49,10 +54,6 @@ `store_object` validates the object to ensure it matches the given arguments before moving the file to its permanent address. - Note, calling `store_object` is a possibility, but should only store the object - without calling `tag_object`. It is the caller's responsibility to finalize the - process by calling `tag_object` after veriftying the correct object is stored. - Args: pid (string): Authority-based identifier. data (mixed): String or path to object. 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index d6ee134f..ce04ecec 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -667,8 +667,6 @@ def test_validate_arg_object(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) hex_digests = object_metadata.hex_digests checksum = object_metadata.hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm @@ -692,8 +690,6 @@ def test_validate_arg_object_incorrect_size(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) hex_digests = object_metadata.hex_digests checksum = hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm @@ -718,8 +714,6 @@ def test_validate_arg_object_incorrect_size_with_pid(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) hex_digests = object_metadata.hex_digests checksum = object_metadata.hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm @@ -749,8 +743,6 @@ def test_validate_arg_object_missing_key_in_hex_digests(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(data=path) - cid = object_metadata.id - store.tag_object(pid, cid) checksum = object_metadata.hex_digests.get(store.algorithm) checksum_algorithm = "blake2s" expected_file_size = object_metadata.obj_size diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 8e815f92..ec418b9c 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -518,7 +518,6 @@ def test_find_object(pids, store): for pid in pids.keys(): path = 
test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) cid = store.find_object(pid) assert cid == object_metadata.hex_digests.get("sha256") @@ -703,7 +702,6 @@ def test_retrieve_object(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) store.store_metadata(pid, syspath, format_id) obj_stream = store.retrieve_object(pid) sha256_hex = store.computehash(obj_stream) @@ -801,8 +799,7 @@ def test_delete_objects(pids, store): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) + _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) assert store.count(entity) == 0 @@ -816,8 +813,7 @@ def test_delete_objects_pid_refs_file(pids, store): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) + _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) pid_refs_file_path = store.get_refs_abs_path("pid", pid) @@ -833,9 +829,8 @@ def test_delete_objects_cid_refs_file(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) - cid = object_metadata.id - store.tag_object(pid, cid) _metadata_cid = store.store_metadata(pid, syspath, format_id) + cid = object_metadata.id store.delete_object(pid) cid_refs_file_path = store.get_refs_abs_path("cid", cid) assert not os.path.exists(cid_refs_file_path) @@ -851,7 +846,6 @@ def 
test_delete_objects_cid_refs_file_with_pid_refs_remaining(pids, store): syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) cid = object_metadata.id - store.tag_object(pid, cid) cid_refs_abs_path = store.get_refs_abs_path("cid", cid) # pylint: disable=W0212 store._update_cid_refs(cid_refs_abs_path, "dou.test.1") @@ -937,8 +931,7 @@ def test_get_hex_digest(store): path = test_dir + pid filename = pid + ".xml" syspath = Path(test_dir) / filename - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) + _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) sha3_256_hex_digest = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 507ee509..e4974bcc 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -10,7 +10,7 @@ def test_tag_object(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(None, path) object_tagged = store.tag_object(pid, object_metadata.id) assert object_tagged @@ -20,7 +20,7 @@ def test_tag_object_pid_refs_file(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.id) pid_refs_file_path = store.get_refs_abs_path("pid", pid) assert os.path.exists(pid_refs_file_path) @@ -31,7 +31,7 @@ def test_tag_object_pid_refs_file_exists(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(None, path) cid = 
object_metadata.id store.tag_object(pid, cid) pid_refs_file_path = store.get_refs_abs_path("pid", pid) @@ -47,7 +47,7 @@ def test_tag_object_pid_refs_file_content(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.id) pid_refs_file_path = store.get_refs_abs_path("pid", pid) with open(pid_refs_file_path, "r", encoding="utf8") as f: @@ -60,7 +60,7 @@ def test_tag_object_cid_refs_file(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(None, path) cid = object_metadata.id store.tag_object(pid, object_metadata.id) cid_refs_file_path = store.get_refs_abs_path("cid", cid) @@ -72,7 +72,7 @@ def test_tag_object_cid_refs_file_content(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.id) cid_refs_file_path = store.get_refs_abs_path("cid", object_metadata.id) with open(cid_refs_file_path, "r", encoding="utf8") as f: @@ -86,7 +86,7 @@ def test_tag_object_cid_refs_file_exists(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.id) another_cid = "dou.test.1" with pytest.raises(FileExistsError): @@ -101,7 +101,7 @@ def test_tag_object_cid_refs_update(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + object_metadata = store.store_object(None, path) cid = 
object_metadata.id store.tag_object(pid, cid) store.tag_object("dou.test.1", cid) diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index ede33f3a..1d61fd17 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -47,8 +47,7 @@ def test_get_checksum(capsys, store, pids): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) + store.store_object(pid, path) client_module_path = f"{client_directory}/client.py" test_store = store.root @@ -145,8 +144,7 @@ def test_retrieve_objects(capsys, pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) + store.store_object(pid, path) client_module_path = f"{client_directory}/client.py" test_store = store.root @@ -226,8 +224,7 @@ def test_delete_objects(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - store.tag_object(pid, object_metadata.id) + store.store_object(pid, path) client_module_path = f"{client_directory}/client.py" test_store = store.root From d8fe8620ae5d1e9246c9514dfa563baad5e05e9e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 6 Dec 2023 09:50:32 -0800 Subject: [PATCH 64/71] Clean up 'filehashstore' class for doc strings, typos and syntax formatting --- .gitignore | 1 + src/hashstore/filehashstore.py | 25 +++++++++++-------------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index 09ccd077..c2a663ae 100644 --- a/.gitignore +++ b/.gitignore @@ -131,6 +131,7 @@ venv/ ENV/ env.bak/ venv.bak/ +.idea # Spyder project settings .spyderproject diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index adce204e..f74a6072 
100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -201,9 +201,8 @@ def write_properties(self, properties): else: exception_string = ( f"FileHashStore - write_properties: algorithm supplied ({store_algorithm})" - + " cannot be used as default for HashStore. Must be one of:" - + " MD5, SHA-1, SHA-256, SHA-384, SHA-512 which are DataONE" - + " controlled algorithm values" + f" cannot be used as default for HashStore. Must be one of: {', '.join(accepted_store_algorithms)}" + f" which are DataONE controlled algorithm values" ) logging.error(exception_string) raise ValueError(exception_string) @@ -238,7 +237,6 @@ def _build_hashstore_yaml_string( """Build a YAML string representing the configuration for a HashStore. Args: - store_path (str): Path to the HashStore directory. store_depth (int): Depth when sharding an object's hex digest. store_width (int): Width of directories when sharding an object's hex digest. store_algorithm (str): Hash algorithm used for calculating the object's hex digest. @@ -374,7 +372,7 @@ def _validate_properties(self, properties): def _set_default_algorithms(self): """Set the default algorithms to calculate when storing objects.""" - def lookup_algo(algo): + def lookup_algo(algo_to_translate): """Translate DataONE controlled algorithms to python hashlib values: https://dataoneorg.github.io/api-documentation/apis/Types.html#Types.ChecksumAlgorithm """ @@ -385,7 +383,7 @@ def lookup_algo(algo): "SHA-384": "sha384", "SHA-512": "sha512", } - return dataone_algo_translation[algo] + return dataone_algo_translation[algo_to_translate] if not os.path.exists(self.hashstore_configuration_yaml): exception_string = ( @@ -490,7 +488,7 @@ def store_object( def verify_object( self, object_metadata, checksum, checksum_algorithm, expected_file_size ): - """Confirms that a object_metadata's content is equal to the given values. + """Confirms that an object_metadata's content is equal to the given values. 
Args: object_metadata (ObjectMetadata): object_metadata object @@ -1067,7 +1065,7 @@ def _move_and_get_checksums( self.delete(entity, tmp_file_name) raise FileExistsError(exception_string) - return (object_cid, tmp_file_size, hex_digests) + return object_cid, tmp_file_size, hex_digests def _write_to_tmp_file_and_get_hex_digests( self, stream, additional_algorithm=None, checksum_algorithm=None @@ -1079,7 +1077,7 @@ def _write_to_tmp_file_and_get_hex_digests( Args: stream (io.BufferedReader): Object stream. - algorithm (string): Algorithm of additional hex digest to generate + additional_algorithm (string): Algorithm of additional hex digest to generate checksum_algorithm (string): Algorithm of additional checksum algo to generate Returns: @@ -1195,7 +1193,7 @@ def delete_tmp_file(): def _write_cid_refs_file(self, path, pid): """Write the cid reference file in the supplied path to a file. A reference file contains every pid that references a cid each on its own line. This method will - only write into an empty file, and will not write over an an existing one. + only write into an empty file, and will not write over an existing one. Args: path (string): Path of file to be written into @@ -1483,11 +1481,10 @@ def put_metadata(self, metadata, pid, format_id): raise FileNotFoundError(exception_string) def _mktmpmetadata(self, stream): - """Create a named temporary file with `stream` (metadata) and `format_id`. + """Create a named temporary file with `stream` (metadata). Args: stream (io.BufferedReader): Metadata stream. - format_id (string): Format of metadata. Returns: tmp.name (string): Path/name of temporary file created and written into. @@ -1891,8 +1888,8 @@ def compact(items): # This creates a list of `depth` number of tokens with width # `width` from the first part of the id plus the remainder. 
hierarchical_list = compact( - [digest[i * self.width : self.width * (i + 1)] for i in range(self.depth)] - + [digest[self.depth * self.width :]] + [digest[i * self.width: self.width * (i + 1)] for i in range(self.depth)] + + [digest[self.depth * self.width:]] ) return hierarchical_list From caa9d7bf54ebff8058beba3de2d9415d247b44ef Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 6 Dec 2023 09:58:50 -0800 Subject: [PATCH 65/71] Remove redundant method '_validate_arg_metadata' and refactor 'store_metadata()' --- src/hashstore/filehashstore.py | 33 +++------------------------------ 1 file changed, 3 insertions(+), 30 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index f74a6072..deb4e407 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -641,7 +641,7 @@ def store_metadata(self, pid, metadata, format_id=None): # Validate input parameters self._is_string_none_or_empty(pid, "pid", "store_metadata") checked_format_id = self._validate_arg_format_id(format_id, "store_metadata") - self._validate_arg_metadata(metadata) + self._validate_arg_data(metadata) # Wait for the pid to release if it's in use while pid in self.metadata_locked_pids: @@ -1649,33 +1649,6 @@ def _validate_arg_object( logging.error(exception_string) raise ValueError(exception_string) - def _validate_arg_metadata(self, metadata): - """Evaluates a metadata argument to ensure that it is either a string, path or - stream object before attempting to store it. - - Args: - metadata (string, path, stream): metadata to validate - """ - if isinstance(metadata, str): - if metadata.replace(" ", "") == "": - exception_string = ( - "FileHashStore - store_metadata: Given string path to" - + " metadata cannot be empty." 
- ) - logging.error(exception_string) - raise TypeError(exception_string) - if ( - not isinstance(metadata, str) - and not isinstance(metadata, Path) - and not isinstance(metadata, io.BufferedIOBase) - ): - exception_string = ( - "FileHashStore - store_metadata: Metadata must be a path or string" - + f" type, data type supplied: {type(metadata)}" - ) - logging.error(exception_string) - raise TypeError(exception_string) - def _validate_arg_format_id(self, format_id, method): """Determines the metadata namespace (format_id) to use for storing, retrieving and deleting metadata. @@ -1888,8 +1861,8 @@ def compact(items): # This creates a list of `depth` number of tokens with width # `width` from the first part of the id plus the remainder. hierarchical_list = compact( - [digest[i * self.width: self.width * (i + 1)] for i in range(self.depth)] - + [digest[self.depth * self.width:]] + [digest[i * self.width : self.width * (i + 1)] for i in range(self.depth)] + + [digest[self.depth * self.width :]] ) return hierarchical_list From 5cde868fe81e36c4fd5891342b6ea8ef0444d463 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 6 Dec 2023 10:07:19 -0800 Subject: [PATCH 66/71] Refactor '_is_string_none_or_empty' to call .strip() instead of .replace() to account for spaces, tabs and newline characters --- src/hashstore/filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index deb4e407..58ccad60 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2096,7 +2096,7 @@ def _is_string_none_or_empty(string, arg, method): arg (string): Name of argument to check method (string): Calling method for logging purposes """ - if string is None or string.replace(" ", "") == "": + if string is None or string.strip() == "": exception_string = ( f"FileHashStore - {method}: {arg} cannot be None" + f" or empty, {arg}: {string}." 
From 9db5a26d8dcf2e725781b674fa982c2b1ebb74b6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 6 Dec 2023 10:17:37 -0800 Subject: [PATCH 67/71] Rename method '_is_string_none_or_empty' to '_validate_string' for accuracy and refactor accordingly --- src/hashstore/filehashstore.py | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 58ccad60..71ede0bc 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -432,7 +432,7 @@ def store_object( "FileHashStore - store_object: Request to store object for pid: %s", pid ) # Validate input parameters - self._is_string_none_or_empty(pid, "pid", "store_object") + self._validate_string(pid, "pid", "store_object") self._validate_arg_data(data) self._is_int_and_non_negative(expected_object_size) ( @@ -496,10 +496,8 @@ def verify_object( checksum_algorithm (string): Algorithm of checksum expected_file_size (int): Size of the tmp file """ - self._is_string_none_or_empty(checksum, "checksum", "verify_object") - self._is_string_none_or_empty( - checksum_algorithm, "checksum_algorithm", "verify_object" - ) + self._validate_string(checksum, "checksum", "verify_object") + self._validate_string(checksum_algorithm, "checksum_algorithm", "verify_object") self._is_int_and_non_negative(expected_file_size) if object_metadata is None or not isinstance(object_metadata, ObjectMetadata): exception_string = ( @@ -537,8 +535,8 @@ def tag_object(self, pid, cid): cid, pid, ) - self._is_string_none_or_empty(pid, "pid", "tag_object") - self._is_string_none_or_empty(cid, "cid", "tag_object") + self._validate_string(pid, "pid", "tag_object") + self._validate_string(cid, "cid", "tag_object") # Wait for the cid to release if it's being tagged while cid in self.reference_locked_cids: logging.debug( @@ -618,7 +616,7 @@ def find_object(self, pid): logging.debug( "FileHashStore - find_object: Request to find object 
for for pid: %s", pid ) - self._is_string_none_or_empty(pid, "pid", "find_object") + self._validate_string(pid, "pid", "find_object") pid_ref_abs_path = self.get_refs_abs_path("pid", pid) if not os.path.exists(pid_ref_abs_path): @@ -639,7 +637,7 @@ def store_metadata(self, pid, metadata, format_id=None): "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid ) # Validate input parameters - self._is_string_none_or_empty(pid, "pid", "store_metadata") + self._validate_string(pid, "pid", "store_metadata") checked_format_id = self._validate_arg_format_id(format_id, "store_metadata") self._validate_arg_data(metadata) @@ -685,7 +683,7 @@ def retrieve_object(self, pid): "FileHashStore - retrieve_object: Request to retrieve object for pid: %s", pid, ) - self._is_string_none_or_empty(pid, "pid", "retrieve_object") + self._validate_string(pid, "pid", "retrieve_object") object_cid = self.find_object(pid) entity = "objects" @@ -713,7 +711,7 @@ def retrieve_metadata(self, pid, format_id=None): "FileHashStore - retrieve_metadata: Request to retrieve metadata for pid: %s", pid, ) - self._is_string_none_or_empty(pid, "pid", "retrieve_metadata") + self._validate_string(pid, "pid", "retrieve_metadata") checked_format_id = self._validate_arg_format_id(format_id, "retrieve_metadata") entity = "metadata" @@ -737,7 +735,7 @@ def delete_object(self, pid): logging.debug( "FileHashStore - delete_object: Request to delete object for pid: %s", pid ) - self._is_string_none_or_empty(pid, "pid", "delete_object") + self._validate_string(pid, "pid", "delete_object") cid = self.find_object(pid) while cid in self.reference_locked_cids: @@ -788,7 +786,7 @@ def delete_metadata(self, pid, format_id=None): "FileHashStore - delete_metadata: Request to delete metadata for pid: %s", pid, ) - self._is_string_none_or_empty(pid, "pid", "delete_metadata") + self._validate_string(pid, "pid", "delete_metadata") checked_format_id = self._validate_arg_format_id(format_id, "delete_metadata") 
entity = "metadata" @@ -806,8 +804,8 @@ def get_hex_digest(self, pid, algorithm): "FileHashStore - get_hex_digest: Request to get hex digest for object with pid: %s", pid, ) - self._is_string_none_or_empty(pid, "pid", "get_hex_digest") - self._is_string_none_or_empty(algorithm, "algorithm", "get_hex_digest") + self._validate_string(pid, "pid", "get_hex_digest") + self._validate_string(algorithm, "algorithm", "get_hex_digest") entity = "objects" algorithm = self.clean_algorithm(algorithm) @@ -1565,13 +1563,13 @@ def _validate_arg_algorithms_and_checksum( additional_algorithm_checked = self.clean_algorithm(additional_algorithm) checksum_algorithm_checked = None if checksum is not None: - self._is_string_none_or_empty( + self._validate_string( checksum_algorithm, "checksum_algorithm", "validate_checksum_args (store_object)", ) if checksum_algorithm is not None: - self._is_string_none_or_empty( + self._validate_string( checksum, "checksum", "validate_checksum_args (store_object)", @@ -2088,7 +2086,7 @@ def _is_int_and_non_negative(file_size): raise ValueError(exception_string) @staticmethod - def _is_string_none_or_empty(string, arg, method): + def _validate_string(string, arg, method): """Checks whether a string is None or empty and throws an exception if so. 
Args: From da78588ebecdce087139ce2229401da2a720744b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 6 Dec 2023 10:27:51 -0800 Subject: [PATCH 68/71] Remove redundant instance check in '_is_int_and_non_negative' method --- src/hashstore/filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 71ede0bc..656d02c6 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2078,7 +2078,7 @@ def _is_int_and_non_negative(file_size): ) logging.error(exception_string) raise TypeError(exception_string) - if file_size < 1 or not isinstance(file_size, int): + if file_size < 1: exception_string = ( "FileHashStore - _is_int_and_non_negative: size given must be > 0" ) From 2580808d80e0a13c42efb969931629f6cec210d9 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 6 Dec 2023 11:21:40 -0800 Subject: [PATCH 69/71] Revise logging message accuracy in '_validate_arg_algorithms_and_checksum' --- src/hashstore/filehashstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 656d02c6..b15dbaae 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1566,13 +1566,13 @@ def _validate_arg_algorithms_and_checksum( self._validate_string( checksum_algorithm, "checksum_algorithm", - "validate_checksum_args (store_object)", + "_validate_arg_algorithms_and_checksum (store_object)", ) if checksum_algorithm is not None: self._validate_string( checksum, "checksum", - "validate_checksum_args (store_object)", + "_validate_arg_algorithms_and_checksum (store_object)", ) # Set checksum_algorithm checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) From 734074771e3860ce4b889a53a4f5ccdf562ad231 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 6 Dec 2023 12:01:53 -0800 Subject: [PATCH 70/71] Add 'verify_object' abstract method to 'HashStore' interface 
--- src/hashstore/hashstore.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 1fc27ebb..ab071551 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -84,6 +84,20 @@ def tag_object(self, pid, cid): """ raise NotImplementedError() + @abstractmethod + def verify_object( + self, object_metadata, checksum, checksum_algorithm, expected_file_size + ): + """Confirms that an object_metadata's content is equal to the given values. + + Args: + object_metadata (ObjectMetadata): object_metadata object + checksum (string): Value of checksum + checksum_algorithm (string): Algorithm of checksum + expected_file_size (int): Size of the tmp file + """ + raise NotImplementedError() + @abstractmethod def find_object(self, pid): """The `find_object` method checks whether an object referenced by a pid exists From f9a96d7497d73ae7a4b6031714f84016f734ce95 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 6 Dec 2023 12:11:34 -0800 Subject: [PATCH 71/71] Clean up code --- src/hashstore/client.py | 2 +- src/hashstore/filehashstore.py | 12 ++---------- src/hashstore/hashstore.py | 11 ++++++----- tests/test_filehashstore_interface.py | 8 ++++---- tests/test_hashstore_client.py | 2 -- 5 files changed, 13 insertions(+), 22 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index c1e2e4b6..dac73fcf 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -392,7 +392,7 @@ def validate_object(self, obj_tuple): obj_db_checksum = obj_tuple[2] with self.hashstore.retrieve_object(pid_guid) as obj_stream: - computed_digest = self.hashstore.computehash(obj_stream, algo) + computed_digest = self.hashstore.get_hex_digest(obj_stream, algo) obj_stream.close() if computed_digest != obj_db_checksum: diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b15dbaae..6e6c11bb 100644 --- a/src/hashstore/filehashstore.py +++ 
b/src/hashstore/filehashstore.py @@ -488,14 +488,6 @@ def store_object( def verify_object( self, object_metadata, checksum, checksum_algorithm, expected_file_size ): - """Confirms that an object_metadata's content is equal to the given values. - - Args: - object_metadata (ObjectMetadata): object_metadata object - checksum (string): Value of checksum - checksum_algorithm (string): Algorithm of checksum - expected_file_size (int): Size of the tmp file - """ self._validate_string(checksum, "checksum", "verify_object") self._validate_string(checksum_algorithm, "checksum_algorithm", "verify_object") self._is_int_and_non_negative(expected_file_size) @@ -1859,8 +1851,8 @@ def compact(items): # This creates a list of `depth` number of tokens with width # `width` from the first part of the id plus the remainder. hierarchical_list = compact( - [digest[i * self.width : self.width * (i + 1)] for i in range(self.depth)] - + [digest[self.depth * self.width :]] + [digest[i * self.width: self.width * (i + 1)] for i in range(self.depth)] + + [digest[self.depth * self.width:]] ) return hierarchical_list diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index ab071551..d1ff440c 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod from collections import namedtuple import importlib.metadata +import importlib.util class HashStore(ABC): @@ -256,12 +257,12 @@ def get_hashstore(module_name, class_name, properties=None): class ObjectMetadata(namedtuple("ObjectMetadata", ["id", "obj_size", "hex_digests"])): - """File address containing file's path on disk and its content hash ID. + """Represents metadata associated with an object. - Args: - ab_id (str): Hash ID (hexdigest) of file contents. - obj_size (bytes): Size of the object - hex_digests (dict, optional): A list of hex digests to validate objects + Attributes: + id (str): A unique identifier for the object (Hash ID, hex digest). 
+ obj_size (bytes): The size of the object in bytes. + hex_digests (list, optional): A list of hex digests to validate objects (md5, sha1, sha256, sha384, sha512) """ diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index ec418b9c..c06c23d1 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -398,10 +398,10 @@ def test_store_object_duplicates_threads(pids, store): file_exists_error_flag = False - def store_object_wrapper(pid, path): + def store_object_wrapper(obj_pid, obj_path): nonlocal file_exists_error_flag try: - store.store_object(pid, path) # Call store_object inside the thread + store.store_object(obj_pid, obj_path) # Call store_object inside the thread except FileExistsError: file_exists_error_flag = True @@ -444,10 +444,10 @@ def test_store_object_interrupt_process(store): interrupt_flag = False - def store_object_wrapper(pid, path): + def store_object_wrapper(obj_pid, path): print(store.root) while not interrupt_flag: - store.store_object(pid, path) # Call store_object inside the thread + store.store_object(obj_pid, path) # Call store_object inside the thread # Create/start the thread thread = threading.Thread(target=store_object_wrapper, args=(pid, file_path)) diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 1d61fd17..96c9ad45 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -82,7 +82,6 @@ def test_store_object(store, pids): client_directory = os.getcwd() + "/src/hashstore" test_dir = "tests/testdata/" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") client_module_path = f"{client_directory}/client.py" test_store = store.root store_object_opt = "-storeobject" @@ -111,7 +110,6 @@ def test_store_metadata(store, pids): test_dir = "tests/testdata/" namespace = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = 
pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename client_module_path = f"{client_directory}/client.py"