From 2a57998310b514e321e840b805a0e11e0f717795 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Tue, 17 May 2022 09:23:25 -0500 Subject: [PATCH 01/28] Add option to disable checksums --- datajoint/fetch.py | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/datajoint/fetch.py b/datajoint/fetch.py index 936624400..327192349 100644 --- a/datajoint/fetch.py +++ b/datajoint/fetch.py @@ -32,7 +32,7 @@ def to_dicts(recarray): yield dict(zip(recarray.dtype.names, rec.tolist())) -def _get(connection, attr, data, squeeze, download_path): +def _get(connection, attr, data, squeeze, download_path, disable_checksum): """ This function is called for every attribute @@ -71,7 +71,7 @@ def _get(connection, attr, data, squeeze, download_path): else data.split(b"\0", 1)[0].decode() ) local_filepath = Path(download_path) / attachment_name - if local_filepath.is_file(): + if local_filepath.is_file() and not disable_checksum: attachment_checksum = ( _uuid if attr.is_external else hash.uuid_from_buffer(data) ) @@ -149,7 +149,8 @@ def __call__( format=None, as_dict=None, squeeze=False, - download_path="." + download_path=".", + disable_checksum=False ): """ Fetches the expression results from the database into an np.array or list of dictionaries and @@ -170,6 +171,18 @@ def __call__( :param download_path: for fetches that download data, e.g. attachments :return: the contents of the relation in the form of a structured numpy.array or a dict list """ + if disable_checksum and config["safemode"]: + warn_str = ( + b"Are you sure that you want to disable checksums?\n" + + b"There is no guarantee that any files you retrieve have been unmodified.\n" + + "(Y/N):" + ) + choice = input(warn_str) + if choice == "N": + return "Nothing fetched (fetch aborted)" + elif choice != "Y": + return "Nothing fetched (invalid prompt input)" + if order_by is not None: # if 'order_by' passed in a string, make into list if isinstance(order_by, str): @@ -220,6 +233,7 @@ def __call__( self._expression.connection, squeeze=squeeze, download_path=download_path, + disable_checksum=disable_checksum, ) if attrs: # a list of attributes provided attributes = [a for a in attrs if not is_key(a)] @@ -300,7 +314,9 @@ class Fetch1: def __init__(self, expression): self._expression = expression - def __call__(self, *attrs, squeeze=False, download_path="."): + def __call__( + self, *attrs, squeeze=False, download_path=".", disable_checksum=False + ): """ Fetches the result of a query expression that yields one entry. @@ -319,6 +335,18 @@ def __call__(self, *attrs, squeeze=False, download_path="."): """ heading = self._expression.heading + if disable_checksum and config["safemode"]: + warn_str = ( + b"Are you sure that you want to disable checksums?\n" + + b"There is no guarantee that any files you retrieve have been unmodified.\n" + + "(Y/N):" + ) + choice = input(warn_str) + if choice == "N": + return "Nothing fetched (fetch aborted)" + elif choice != "Y": + return "Nothing fetched (invalid prompt input)" + if not attrs: # fetch all attributes, return as ordered dict cur = self._expression.cursor(as_dict=True) ret = cur.fetchone() @@ -335,6 +363,7 @@ def __call__(self, *attrs, squeeze=False, download_path="."): ret[name], squeeze=squeeze, download_path=download_path, + disable_checksum=disable_checksum, ), ) for name in heading.names From 45d125017b77d2a5ad2d08c8d20bcc29d5583d06 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Tue, 17 May 2022 09:35:32 -0500 Subject: [PATCH 02/28] fix minor bug. --- datajoint/fetch.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datajoint/fetch.py b/datajoint/fetch.py index 327192349..98e7f8052 100644 --- a/datajoint/fetch.py +++ b/datajoint/fetch.py @@ -173,8 +173,8 @@ def __call__( """ if disable_checksum and config["safemode"]: warn_str = ( - b"Are you sure that you want to disable checksums?\n" - + b"There is no guarantee that any files you retrieve have been unmodified.\n" + "Are you sure that you want to disable checksums?\n" + + "There is no guarantee that any files you retrieve have been unmodified.\n" + "(Y/N):" ) choice = input(warn_str) @@ -337,8 +337,8 @@ def __call__( if disable_checksum and config["safemode"]: warn_str = ( - b"Are you sure that you want to disable checksums?\n" - + b"There is no guarantee that any files you retrieve have been unmodified.\n" + "Are you sure that you want to disable checksums?\n" + + "There is no guarantee that any files you retrieve have been unmodified.\n" + "(Y/N):" ) choice = input(warn_str) From c0775ba16c7b64924eee415a678705107531f598 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Thu, 19 May 2022 15:13:24 -0500 Subject: [PATCH 03/28] Revert commits --- datajoint/fetch.py | 37 ++++--------------------------------- 1 file changed, 4 insertions(+), 33 deletions(-) diff --git a/datajoint/fetch.py b/datajoint/fetch.py index 98e7f8052..936624400 100644 --- a/datajoint/fetch.py +++ b/datajoint/fetch.py @@ -32,7 +32,7 @@ def to_dicts(recarray): yield dict(zip(recarray.dtype.names, rec.tolist())) -def _get(connection, attr, data, squeeze, download_path, disable_checksum): +def _get(connection, attr, data, squeeze, download_path): """ This function is called for every attribute @@ -71,7 +71,7 @@ def _get(connection, attr, data, squeeze, download_path, disable_checksum): else data.split(b"\0", 1)[0].decode() ) local_filepath = Path(download_path) / attachment_name - if local_filepath.is_file() and not disable_checksum: + if local_filepath.is_file(): attachment_checksum = ( _uuid if attr.is_external else hash.uuid_from_buffer(data) ) @@ -149,8 +149,7 @@ def __call__( format=None, as_dict=None, squeeze=False, - download_path=".", - disable_checksum=False + download_path="." ): """ Fetches the expression results from the database into an np.array or list of dictionaries and @@ -171,18 +170,6 @@ def __call__( :param download_path: for fetches that download data, e.g. attachments :return: the contents of the relation in the form of a structured numpy.array or a dict list """ - if disable_checksum and config["safemode"]: - warn_str = ( - "Are you sure that you want to disable checksums?\n" - + "There is no guarantee that any files you retrieve have been unmodified.\n" - + "(Y/N):" - ) - choice = input(warn_str) - if choice == "N": - return "Nothing fetched (fetch aborted)" - elif choice != "Y": - return "Nothing fetched (invalid prompt input)" - if order_by is not None: # if 'order_by' passed in a string, make into list if isinstance(order_by, str): @@ -233,7 +220,6 @@ def __call__( self._expression.connection, squeeze=squeeze, download_path=download_path, - disable_checksum=disable_checksum, ) if attrs: # a list of attributes provided attributes = [a for a in attrs if not is_key(a)] @@ -314,9 +300,7 @@ class Fetch1: def __init__(self, expression): self._expression = expression - def __call__( - self, *attrs, squeeze=False, download_path=".", disable_checksum=False - ): + def __call__(self, *attrs, squeeze=False, download_path="."): """ Fetches the result of a query expression that yields one entry. @@ -335,18 +319,6 @@ def __call__( """ heading = self._expression.heading - if disable_checksum and config["safemode"]: - warn_str = ( - "Are you sure that you want to disable checksums?\n" - + "There is no guarantee that any files you retrieve have been unmodified.\n" - + "(Y/N):" - ) - choice = input(warn_str) - if choice == "N": - return "Nothing fetched (fetch aborted)" - elif choice != "Y": - return "Nothing fetched (invalid prompt input)" - if not attrs: # fetch all attributes, return as ordered dict cur = self._expression.cursor(as_dict=True) ret = cur.fetchone() @@ -363,7 +335,6 @@ def __call__( ret[name], squeeze=squeeze, download_path=download_path, - disable_checksum=disable_checksum, ), ) for name in heading.names From 6cee155076c4252c6291f5d629a15b08fb89c96a Mon Sep 17 00:00:00 2001 From: jverswijver Date: Wed, 25 May 2022 08:58:55 -0500 Subject: [PATCH 04/28] update checksum disable. --- datajoint/external.py | 34 ++++++++++++++++++++++++++++++---- datajoint/settings.py | 1 + 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index c04cc40c4..408c5501e 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -310,12 +310,38 @@ def download_filepath(self, filepath_hash): ) external_path = self._make_external_filepath(relative_filepath) local_filepath = Path(self.spec["stage"]).absolute() / relative_filepath - file_exists = ( + if ( + config["filepath_checksum_size_limit"] is not None + and Path(local_filepath).stat().st_size + > config["filepath_checksum_size_limit"] + ): + print(f"WARNING SKIPPING CHECKSUM FOR {filepath_hash}\n", flush=True) + if ( Path(local_filepath).is_file() - and uuid_from_file(local_filepath) == contents_hash - ) + and config["filepath_checksum_size_limit"] is None + ): + file_exists = uuid_from_file(local_filepath) == contents_hash + elif ( + Path(local_filepath).is_file() + and config["filepath_checksum_size_limit"] is not None + ): + file_exists = ( + True + if Path(local_filepath).stat().st_size # size in bytes + > config["filepath_checksum_size_limit"] + else (uuid_from_file(local_filepath) == contents_hash) + ) + else: + file_exists = False + if not file_exists: self._download_file(external_path, local_filepath) + if ( + config["filepath_checksum_size_limit"] is not None + and Path(local_filepath).stat().st_size + > config["filepath_checksum_size_limit"] + ): + return str(local_filepath), contents_hash checksum = uuid_from_file(local_filepath) if ( checksum != contents_hash @@ -402,7 +428,7 @@ def delete( delete_external_files=None, limit=None, display_progress=True, - errors_as_string=True + errors_as_string=True, ): """ diff --git a/datajoint/settings.py b/datajoint/settings.py index a0b59e0a5..1be33c7da 100644 --- a/datajoint/settings.py +++ b/datajoint/settings.py @@ -46,6 +46,7 @@ "display.show_tuple_count": True, "database.use_tls": None, "enable_python_native_blobs": True, # python-native/dj0 encoding support + "filepath_checksum_size_limit": None, # file size limit for when to disable checksums } ) From f949e38b20873b510c2352c98ab046a0c85b1a3a Mon Sep 17 00:00:00 2001 From: jverswijver Date: Wed, 25 May 2022 10:30:55 -0500 Subject: [PATCH 05/28] Refactor logic for if statements. --- datajoint/external.py | 62 +++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 35 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index 408c5501e..6e3805923 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -310,47 +310,39 @@ def download_filepath(self, filepath_hash): ) external_path = self._make_external_filepath(relative_filepath) local_filepath = Path(self.spec["stage"]).absolute() / relative_filepath - if ( - config["filepath_checksum_size_limit"] is not None - and Path(local_filepath).stat().st_size - > config["filepath_checksum_size_limit"] - ): - print(f"WARNING SKIPPING CHECKSUM FOR {filepath_hash}\n", flush=True) - if ( - Path(local_filepath).is_file() - and config["filepath_checksum_size_limit"] is None - ): - file_exists = uuid_from_file(local_filepath) == contents_hash - elif ( - Path(local_filepath).is_file() - and config["filepath_checksum_size_limit"] is not None - ): - file_exists = ( - True - if Path(local_filepath).stat().st_size # size in bytes - > config["filepath_checksum_size_limit"] - else (uuid_from_file(local_filepath) == contents_hash) + + file_exists = Path(local_filepath).is_file() and ( + ( + config["filepath_checksum_size_limit"] # check if None + and Path(local_filepath).stat().st_size + >= config["filepath_checksum_size_limit"] ) - else: - file_exists = False + or uuid_from_file(local_filepath) == contents_hash + ) if not file_exists: self._download_file(external_path, local_filepath) - if ( - config["filepath_checksum_size_limit"] is not None - and Path(local_filepath).stat().st_size - > config["filepath_checksum_size_limit"] + if not config["filepath_checksum_size_limit"] or ( + Path(local_filepath).stat().st_size + < config["filepath_checksum_size_limit"] ): - return str(local_filepath), contents_hash - checksum = uuid_from_file(local_filepath) - if ( - checksum != contents_hash - ): # this should never happen without outside interference - raise DataJointError( - "'{file}' downloaded but did not pass checksum'".format( - file=local_filepath + checksum = uuid_from_file(local_filepath) + if ( + checksum != contents_hash + ): # this should never happen without outside interference + raise DataJointError( + "'{file}' downloaded but did not pass checksum'".format( + file=local_filepath + ) ) - ) + if ( + config["filepath_checksum_size_limit"] + and Path(local_filepath).stat().st_size + >= config["filepath_checksum_size_limit"] + ): + print( + f"WARNING SKIPPED CHECKSUM FOR FILE WITH HASH: {contents_hash}" + ) # placeholder for logger return str(local_filepath), contents_hash # --- UTILITIES --- From 1ab945cdb658d7da9b5726932c39fb0cc843989d Mon Sep 17 00:00:00 2001 From: jverswijver Date: Wed, 25 May 2022 10:35:46 -0500 Subject: [PATCH 06/28] add logging --- datajoint/external.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index 6e3805923..b67334a83 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -9,6 +9,7 @@ from .declare import EXTERNAL_TABLE_ROOT from . import s3 from .utils import safe_write, safe_copy +import logging CACHE_SUBFOLDING = ( 2, @@ -340,9 +341,9 @@ def download_filepath(self, filepath_hash): and Path(local_filepath).stat().st_size >= config["filepath_checksum_size_limit"] ): - print( + logging.warning( f"WARNING SKIPPED CHECKSUM FOR FILE WITH HASH: {contents_hash}" - ) # placeholder for logger + ) return str(local_filepath), contents_hash # --- UTILITIES --- From dcaea1843f18284bc4a005b9aa8fc3e4a7681a08 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Fri, 27 May 2022 10:13:47 -0500 Subject: [PATCH 07/28] add logging, add test, update changelog. --- CHANGELOG.md | 3 +++ datajoint/external.py | 20 ++++++++++---------- docs-parts/intro/Releases_lang1.rst | 4 ++++ tests/test_filepath.py | 13 +++++++++++-- 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 029853156..d6e31dc15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ ## Release notes +### 0.13.4 -- TBD +* Add - option to set threshold for when to stop using checksums for filepath stores. PR #1025 + ### 0.13.4 -- March, 28 2022 * Add - Allow reading blobs produced by legacy 32-bit compiled mYm library for matlab. PR #995 * Bugfix - Add missing `jobs` argument for multiprocessing PR #997 diff --git a/datajoint/external.py b/datajoint/external.py index b67334a83..b3b1fab38 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -11,6 +11,8 @@ from .utils import safe_write, safe_copy import logging +logger = logging.getLogger(__name__) + CACHE_SUBFOLDING = ( 2, 2, @@ -314,18 +316,18 @@ def download_filepath(self, filepath_hash): file_exists = Path(local_filepath).is_file() and ( ( - config["filepath_checksum_size_limit"] # check if None + config.get("filepath_checksum_size_limit") # check if None and Path(local_filepath).stat().st_size - >= config["filepath_checksum_size_limit"] + >= config.get("filepath_checksum_size_limit") ) or uuid_from_file(local_filepath) == contents_hash ) if not file_exists: self._download_file(external_path, local_filepath) - if not config["filepath_checksum_size_limit"] or ( + if not config.get("filepath_checksum_size_limit") or ( Path(local_filepath).stat().st_size - < config["filepath_checksum_size_limit"] + < config.get("filepath_checksum_size_limit") ): checksum = uuid_from_file(local_filepath) if ( @@ -336,12 +338,10 @@ def download_filepath(self, filepath_hash): file=local_filepath ) ) - if ( - config["filepath_checksum_size_limit"] - and Path(local_filepath).stat().st_size - >= config["filepath_checksum_size_limit"] - ): - logging.warning( + if config.get("filepath_checksum_size_limit") and Path( + local_filepath + ).stat().st_size >= config.get("filepath_checksum_size_limit"): + logger.warning( f"WARNING SKIPPED CHECKSUM FOR FILE WITH HASH: {contents_hash}" ) return str(local_filepath), contents_hash diff --git a/docs-parts/intro/Releases_lang1.rst b/docs-parts/intro/Releases_lang1.rst index 997f0eab2..0f217dd38 100644 --- a/docs-parts/intro/Releases_lang1.rst +++ b/docs-parts/intro/Releases_lang1.rst @@ -1,3 +1,7 @@ +0.13.4 -- TBD +---------------------- +* Add - option to set threshold for when to stop using checksums for filepath stores. PR #1025 + 0.13.4 -- March 28, 2022 ---------------------- * Add - Allow reading blobs produced by legacy 32-bit compiled mYm library for matlab. PR #995 diff --git a/tests/test_filepath.py b/tests/test_filepath.py index d0834b071..fda87a9af 100644 --- a/tests/test_filepath.py +++ b/tests/test_filepath.py @@ -3,7 +3,7 @@ import os from pathlib import Path import random - +import logging from .schema_external import schema, Filepath, FilepathS3, stores_config @@ -132,7 +132,9 @@ def test_duplicate_error_s3(): test_duplicate_error(store="repo-s3") -def test_filepath_class(table=Filepath(), store="repo"): +def test_filepath_class(table=Filepath(), store="repo", verify_checksum=True): + if not verify_checksum: + dj.config["filepath_checksum_size_limit"] = 1 # cant be 0, breaks boolean logic dj.errors._switch_filepath_types(True) stage_path = dj.config["stores"][store]["stage"] # create a mock file @@ -169,6 +171,9 @@ def test_filepath_class(table=Filepath(), store="repo"): # delete from external table table.external[store].delete(delete_external_files=True) dj.errors._switch_filepath_types(False) + dj.config["filepath_checksum_size_limit"] = None + # Todo: assert here that the warning from external.download_file has + # Waiting on DJ Logging implementation def test_filepath_class_again(): @@ -185,6 +190,10 @@ def test_filepath_class_s3_again(): test_filepath_class(FilepathS3(), "repo-s3") +def test_filepath_class_no_checksum(): + test_filepath_class(verify_checksum=False) + + def test_filepath_cleanup(table=Filepath(), store="repo"): """test deletion of filepath entries from external table""" From 5715b773669628bcd049731cffa123d0d65fb78d Mon Sep 17 00:00:00 2001 From: jverswijver Date: Fri, 27 May 2022 10:25:40 -0500 Subject: [PATCH 08/28] update changelog. --- CHANGELOG.md | 2 +- docs-parts/intro/Releases_lang1.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fcafd08f..c5ec810e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ ## Release notes ### 0.13.6 -- TBD -* Add - option to set threshold for when to stop using checksums for filepath stores. PR #1025 +* Add - Config option to set threshold for when to stop using checksums for filepath stores. PR #1025 ### 0.13.5 -- May 19, 2022 * Update - Import ABC from collections.abc for Python 3.10 compatibility diff --git a/docs-parts/intro/Releases_lang1.rst b/docs-parts/intro/Releases_lang1.rst index 4fc8a6256..145e6f97a 100644 --- a/docs-parts/intro/Releases_lang1.rst +++ b/docs-parts/intro/Releases_lang1.rst @@ -1,6 +1,6 @@ 0.13.6 -- TBD ---------------------- -* Add - option to set threshold for when to stop using checksums for filepath stores. PR #1025 +* Add - Config option to set threshold for when to stop using checksums for filepath stores. PR #1025 0.13.5 -- May 19, 2022 ---------------------- From aed8bf1d055af18c9ea56cd259397d47d9ecf52a Mon Sep 17 00:00:00 2001 From: jverswijver Date: Fri, 27 May 2022 10:35:31 -0500 Subject: [PATCH 09/28] minor change to comment. --- tests/test_filepath.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_filepath.py b/tests/test_filepath.py index fda87a9af..0fecc1e44 100644 --- a/tests/test_filepath.py +++ b/tests/test_filepath.py @@ -172,7 +172,7 @@ def test_filepath_class(table=Filepath(), store="repo", verify_checksum=True): table.external[store].delete(delete_external_files=True) dj.errors._switch_filepath_types(False) dj.config["filepath_checksum_size_limit"] = None - # Todo: assert here that the warning from external.download_file has + # Todo: assert here that the warning from external.download_file has been logged # Waiting on DJ Logging implementation From e64f32bbd8cb6ce99cd5742fe20b0b46fa645d21 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Fri, 27 May 2022 10:54:18 -0500 Subject: [PATCH 10/28] minor changes to test and logger --- datajoint/external.py | 4 ++-- tests/test_filepath.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index b3b1fab38..bb5435bcf 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -341,9 +341,9 @@ def download_filepath(self, filepath_hash): if config.get("filepath_checksum_size_limit") and Path( local_filepath ).stat().st_size >= config.get("filepath_checksum_size_limit"): - logger.warning( + print( f"WARNING SKIPPED CHECKSUM FOR FILE WITH HASH: {contents_hash}" - ) + ) # This will turn into a proper logger when we implement the datajoint logger return str(local_filepath), contents_hash # --- UTILITIES --- diff --git a/tests/test_filepath.py b/tests/test_filepath.py index 0fecc1e44..2e8f8efcb 100644 --- a/tests/test_filepath.py +++ b/tests/test_filepath.py @@ -3,7 +3,6 @@ import os from pathlib import Path import random -import logging from .schema_external import schema, Filepath, FilepathS3, stores_config From 52902a6e3b4f2c72eaafdedd205b96558c0bf1bc Mon Sep 17 00:00:00 2001 From: jverswijver Date: Fri, 27 May 2022 13:22:23 -0500 Subject: [PATCH 11/28] fix style --- datajoint/external.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datajoint/external.py b/datajoint/external.py index bb5435bcf..476120479 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -343,7 +343,7 @@ def download_filepath(self, filepath_hash): ).stat().st_size >= config.get("filepath_checksum_size_limit"): print( f"WARNING SKIPPED CHECKSUM FOR FILE WITH HASH: {contents_hash}" - ) # This will turn into a proper logger when we implement the datajoint logger + ) # This will turn into a proper logger when we implement the datajoint logger return str(local_filepath), contents_hash # --- UTILITIES --- From 21d75e619ef1a1e2bc0ba14a7f3383addff32ec1 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Fri, 27 May 2022 13:29:19 -0500 Subject: [PATCH 12/28] update logic --- datajoint/external.py | 17 ++++++++--------- tests/test_filepath.py | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index 476120479..eaac3f074 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -307,6 +307,12 @@ def download_filepath(self, filepath_hash): :param filepath_hash: The hash (UUID) of the relative_path :return: hash (UUID) of the contents of the downloaded file or Nones """ + + def _verify_contents(local_filepath): + return config.get("filepath_checksum_size_limit") is None or Path( + local_filepath + ).stat().st_size < config.get("filepath_checksum_size_limit") + if filepath_hash is not None: relative_filepath, contents_hash = (self & {"hash": filepath_hash}).fetch1( "filepath", "contents_hash" @@ -315,20 +321,13 @@ def download_filepath(self, filepath_hash): local_filepath = Path(self.spec["stage"]).absolute() / relative_filepath file_exists = Path(local_filepath).is_file() and ( - ( - config.get("filepath_checksum_size_limit") # check if None - and Path(local_filepath).stat().st_size - >= config.get("filepath_checksum_size_limit") - ) + not _verify_contents(local_filepath) or uuid_from_file(local_filepath) == contents_hash ) if not file_exists: self._download_file(external_path, local_filepath) - if not config.get("filepath_checksum_size_limit") or ( - Path(local_filepath).stat().st_size - < config.get("filepath_checksum_size_limit") - ): + if _verify_contents(local_filepath): checksum = uuid_from_file(local_filepath) if ( checksum != contents_hash diff --git a/tests/test_filepath.py b/tests/test_filepath.py index 2e8f8efcb..851ec0a57 100644 --- a/tests/test_filepath.py +++ b/tests/test_filepath.py @@ -133,7 +133,7 @@ def test_duplicate_error_s3(): def test_filepath_class(table=Filepath(), store="repo", verify_checksum=True): if not verify_checksum: - dj.config["filepath_checksum_size_limit"] = 1 # cant be 0, breaks boolean logic + dj.config["filepath_checksum_size_limit"] = 0 # cant be 0, breaks boolean logic dj.errors._switch_filepath_types(True) stage_path = dj.config["stores"][store]["stage"] # create a mock file From f49b17850459b7927bef707cc40cfc1ccb744116 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Fri, 27 May 2022 13:33:02 -0500 Subject: [PATCH 13/28] fix style --- datajoint/external.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index eaac3f074..26721169b 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -329,9 +329,8 @@ def _verify_contents(local_filepath): self._download_file(external_path, local_filepath) if _verify_contents(local_filepath): checksum = uuid_from_file(local_filepath) - if ( - checksum != contents_hash - ): # this should never happen without outside interference + if checksum != contents_hash: + # this should never happen without outside interference raise DataJointError( "'{file}' downloaded but did not pass checksum'".format( file=local_filepath From 2abac2453024c12594e2ae06ac84ae12bbeba429 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Fri, 27 May 2022 13:34:09 -0500 Subject: [PATCH 14/28] fix style --- datajoint/external.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index 26721169b..bdb42cdde 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -336,12 +336,9 @@ def _verify_contents(local_filepath): file=local_filepath ) ) - if config.get("filepath_checksum_size_limit") and Path( - local_filepath - ).stat().st_size >= config.get("filepath_checksum_size_limit"): - print( - f"WARNING SKIPPED CHECKSUM FOR FILE WITH HASH: {contents_hash}" - ) # This will turn into a proper logger when we implement the datajoint logger + if not _verify_contents(local_filepath): + print(f"WARNING SKIPPED CHECKSUM FOR FILE WITH HASH: {contents_hash}") + # This will turn into a proper logger when we implement the datajoint logger return str(local_filepath), contents_hash # --- UTILITIES --- From c35d509eaee94fe29c47af0de2cce41a0ea72f2d Mon Sep 17 00:00:00 2001 From: jverswijver Date: Fri, 27 May 2022 13:35:39 -0500 Subject: [PATCH 15/28] remove logger --- datajoint/external.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index bdb42cdde..b5991c056 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -9,9 +9,6 @@ from .declare import EXTERNAL_TABLE_ROOT from . import s3 from .utils import safe_write, safe_copy -import logging - -logger = logging.getLogger(__name__) CACHE_SUBFOLDING = ( 2, From 00a4b8a5a857f11f8b0ab8e3835164eb5f417b99 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Fri, 27 May 2022 13:46:23 -0500 Subject: [PATCH 16/28] fix test, bump version --- datajoint/version.py | 2 +- tests/test_filepath.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datajoint/version.py b/datajoint/version.py index 16c0e8ac1..00ba24b09 100644 --- a/datajoint/version.py +++ b/datajoint/version.py @@ -1,3 +1,3 @@ -__version__ = "0.13.5" +__version__ = "0.13.6" assert len(__version__) <= 10 # The log table limits version to the 10 characters diff --git a/tests/test_filepath.py b/tests/test_filepath.py index 851ec0a57..ede187d15 100644 --- a/tests/test_filepath.py +++ b/tests/test_filepath.py @@ -133,7 +133,7 @@ def test_duplicate_error_s3(): def test_filepath_class(table=Filepath(), store="repo", verify_checksum=True): if not verify_checksum: - dj.config["filepath_checksum_size_limit"] = 0 # cant be 0, breaks boolean logic + dj.config["filepath_checksum_size_limit"] = 0 dj.errors._switch_filepath_types(True) stage_path = dj.config["stores"][store]["stage"] # create a mock file From 761fab142e3ea1c960dafad2cbd21446a153f7d2 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Fri, 27 May 2022 13:54:19 -0500 Subject: [PATCH 17/28] apply suggestions from code review. --- datajoint/external.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index b5991c056..050cb3b56 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -305,10 +305,9 @@ def download_filepath(self, filepath_hash): :return: hash (UUID) of the contents of the downloaded file or Nones """ - def _verify_contents(local_filepath): - return config.get("filepath_checksum_size_limit") is None or Path( - local_filepath - ).stat().st_size < config.get("filepath_checksum_size_limit") + def _need_checksum(local_filepath): + limit = config.get("filepath_checksum_size_limit") + return limit is None or Path(local_filepath).stat().st_size < limit if filepath_hash is not None: relative_filepath, contents_hash = (self & {"hash": filepath_hash}).fetch1( @@ -318,13 +317,13 @@ def _verify_contents(local_filepath): local_filepath = Path(self.spec["stage"]).absolute() / relative_filepath file_exists = Path(local_filepath).is_file() and ( - not _verify_contents(local_filepath) + not _need_checksum(local_filepath) or uuid_from_file(local_filepath) == contents_hash ) if not file_exists: self._download_file(external_path, local_filepath) - if _verify_contents(local_filepath): + if _need_checksum(local_filepath): checksum = uuid_from_file(local_filepath) if checksum != contents_hash: # this should never happen without outside interference @@ -333,7 +332,7 @@ def _verify_contents(local_filepath): file=local_filepath ) ) - if not _verify_contents(local_filepath): + if not _need_checksum(local_filepath): print(f"WARNING SKIPPED CHECKSUM FOR FILE WITH HASH: {contents_hash}") # This will turn into a proper logger when we implement the datajoint logger return str(local_filepath), contents_hash From 9fda36174703e613c94eec72220bfd6bbb186a55 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Tue, 31 May 2022 08:21:09 -0500 Subject: [PATCH 18/28] apply suggestions from code review. --- datajoint/external.py | 4 +--- tests/test_filepath.py | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index 050cb3b56..c1d8fd157 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -328,9 +328,7 @@ def _need_checksum(local_filepath): if checksum != contents_hash: # this should never happen without outside interference raise DataJointError( - "'{file}' downloaded but did not pass checksum'".format( - file=local_filepath - ) + f"'{local_filepath}' downloaded but did not pass checksum'" ) if not _need_checksum(local_filepath): print(f"WARNING SKIPPED CHECKSUM FOR FILE WITH HASH: {contents_hash}") diff --git a/tests/test_filepath.py b/tests/test_filepath.py index ede187d15..e579f75d8 100644 --- a/tests/test_filepath.py +++ b/tests/test_filepath.py @@ -171,8 +171,6 @@ def test_filepath_class(table=Filepath(), store="repo", verify_checksum=True): table.external[store].delete(delete_external_files=True) dj.errors._switch_filepath_types(False) dj.config["filepath_checksum_size_limit"] = None - # Todo: assert here that the warning from external.download_file has been logged - # Waiting on DJ Logging implementation def test_filepath_class_again(): From 053c0fc4d5047faa3ecbbe365215c185452af6e1 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Tue, 31 May 2022 09:18:14 -0500 Subject: [PATCH 19/28] change a few .format to use f string. --- datajoint/external.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index c1d8fd157..3d2dc200f 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -72,9 +72,7 @@ def definition(self): @property def table_name(self): - return "{external_table_root}_{store}".format( - external_table_root=EXTERNAL_TABLE_ROOT, store=self.store - ) + return f"{EXTERNAL_TABLE_ROOT}_{self.store}" @property def s3(self): @@ -276,9 +274,7 @@ def upload_filepath(self, local_filepath): # the tracking entry exists, check that it's the same file as before if contents_hash != check_hash[0]: raise DataJointError( - "A different version of '{file}' has already been placed.".format( - file=relative_filepath - ) + f"A different version of '{relative_filepath}' has already been placed." ) else: # upload the file and create its tracking entry From 267095f2263810d368f9f7fc4ef2424334c1b65d Mon Sep 17 00:00:00 2001 From: jverswijver Date: Tue, 31 May 2022 10:22:26 -0500 Subject: [PATCH 20/28] apply suggestion from code review. --- datajoint/external.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index 3d2dc200f..5dc3c8be7 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -320,8 +320,7 @@ def _need_checksum(local_filepath): if not file_exists: self._download_file(external_path, local_filepath) if _need_checksum(local_filepath): - checksum = uuid_from_file(local_filepath) - if checksum != contents_hash: + if uuid_from_file(local_filepath) != contents_hash: # this should never happen without outside interference raise DataJointError( f"'{local_filepath}' downloaded but did not pass checksum'" From bc01762f758bc1d1ad0db25159daeb09b9662400 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Wed, 8 Jun 2022 09:01:07 -0500 Subject: [PATCH 21/28] update test. --- datajoint/external.py | 7 ++++++- tests/test_filepath.py | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/datajoint/external.py b/datajoint/external.py index 5dc3c8be7..212b2d364 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -1,6 +1,7 @@ from pathlib import Path, PurePosixPath, PureWindowsPath from collections.abc import Mapping from tqdm import tqdm +import logging from .settings import config from .errors import DataJointError, MissingExternalFile from .hash import uuid_from_buffer, uuid_from_file @@ -10,6 +11,8 @@ from . import s3 from .utils import safe_write, safe_copy +logger = logging.getLogger(__name__.split(".")[0]) + CACHE_SUBFOLDING = ( 2, 2, @@ -326,7 +329,9 @@ def _need_checksum(local_filepath): f"'{local_filepath}' downloaded but did not pass checksum'" ) if not _need_checksum(local_filepath): - print(f"WARNING SKIPPED CHECKSUM FOR FILE WITH HASH: {contents_hash}") + logger.warning( + f"WARNING SKIPPED CHECKSUM FOR FILE WITH HASH: {contents_hash}" + ) # This will turn into a proper logger when we implement the datajoint logger return str(local_filepath), contents_hash diff --git a/tests/test_filepath.py b/tests/test_filepath.py index e579f75d8..365a7b8a9 100644 --- a/tests/test_filepath.py +++ b/tests/test_filepath.py @@ -4,6 +4,10 @@ from pathlib import Path import random from .schema_external import schema, Filepath, FilepathS3, stores_config +import logging +import io + +logger = logging.getLogger("datajoint") def setUp(self): @@ -188,7 +192,21 @@ def test_filepath_class_s3_again(): def test_filepath_class_no_checksum(): + log_capture = io.StringIO() + stream_handler = logging.StreamHandler(log_capture) + log_format = logging.Formatter( + "[%(asctime)s][%(funcName)s][%(levelname)s]: %(message)s" + ) + stream_handler.setFormatter(log_format) + stream_handler.set_name("test_limit_warning") + logger.addHandler(stream_handler) test_filepath_class(verify_checksum=False) + log_contents = log_capture.getvalue() + log_capture.close() + for handler in logger.handlers: # Clean up handler + if handler.name == "test_limit_warning": + logger.removeHandler(handler) + assert "WARNING SKIPPED CHECKSUM FOR FILE WITH HASH" in log_contents def test_filepath_cleanup(table=Filepath(), store="repo"): From ed3cd01d19d0203f38543c3220aef6f468d14ef1 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Wed, 8 Jun 2022 11:06:55 -0500 Subject: [PATCH 22/28] change error log message. --- datajoint/external.py | 2 +- tests/test_filepath.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index 212b2d364..fd6e13a1a 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -330,7 +330,7 @@ def _need_checksum(local_filepath): ) if not _need_checksum(local_filepath): logger.warning( - f"WARNING SKIPPED CHECKSUM FOR FILE WITH HASH: {contents_hash}" + f"Warning skipped checksum for file with hash: {contents_hash}" ) # This will turn into a proper logger when we implement the datajoint logger return str(local_filepath), contents_hash diff --git a/tests/test_filepath.py b/tests/test_filepath.py index 365a7b8a9..d79c00e5b 100644 --- a/tests/test_filepath.py +++ b/tests/test_filepath.py @@ -206,7 +206,7 @@ def test_filepath_class_no_checksum(): for handler in logger.handlers: # Clean up handler if handler.name == "test_limit_warning": logger.removeHandler(handler) - assert "WARNING SKIPPED CHECKSUM FOR FILE WITH HASH" in log_contents + assert "Warning skipped checksum for file with hash" in log_contents def test_filepath_cleanup(table=Filepath(), store="repo"): From 10a5868288ade9eea1ae9f3a4c3f5fd46380954b Mon Sep 17 00:00:00 2001 From: jverswijver Date: Fri, 10 Jun 2022 09:59:35 -0500 Subject: [PATCH 23/28] apply suggestions from code review. --- datajoint/external.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index fd6e13a1a..513398094 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -309,9 +309,9 @@ def _need_checksum(local_filepath): return limit is None or Path(local_filepath).stat().st_size < limit if filepath_hash is not None: - relative_filepath, contents_hash = (self & {"hash": filepath_hash}).fetch1( - "filepath", "contents_hash" - ) + relative_filepath, contents_hash, size = ( + self & {"hash": filepath_hash} + ).fetch1("filepath", "contents_hash", "size") external_path = self._make_external_filepath(relative_filepath) local_filepath = Path(self.spec["stage"]).absolute() / relative_filepath @@ -329,10 +329,12 @@ def _need_checksum(local_filepath): f"'{local_filepath}' downloaded but did not pass checksum'" ) if not _need_checksum(local_filepath): - logger.warning( - f"Warning skipped checksum for file with hash: {contents_hash}" - ) - # This will turn into a proper logger when we implement the datajoint logger + logger.warning(f"Skipped checksum for file with hash: {contents_hash}") + if size != Path(local_filepath).stat().st_size: + # this should never happen without outside interference + raise DataJointError( + f"'{local_filepath}' downloaded but size is not the same (skipped checksum due to config)'" + ) return str(local_filepath), contents_hash # --- UTILITIES --- From 3ce21f592566543fd3d4719faaeb0188f32223d5 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Fri, 10 Jun 2022 10:02:10 -0500 Subject: [PATCH 24/28] apply suggestions from code review --- datajoint/external.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datajoint/external.py b/datajoint/external.py index 513398094..a30c635df 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -329,7 +329,9 @@ def _need_checksum(local_filepath): f"'{local_filepath}' downloaded but did not pass checksum'" ) if not _need_checksum(local_filepath): - logger.warning(f"Skipped checksum for file with hash: {contents_hash}") + logger.warning( + f"Skipped checksum for file with hash: {contents_hash}, and path: {local_filepath}" + ) if size != Path(local_filepath).stat().st_size: # this should never happen without outside interference raise DataJointError( From 1ecbe5fdc3d4295c5ac1be432c0739ec53ec771d Mon Sep 17 00:00:00 2001 From: jverswijver Date: Fri, 10 Jun 2022 10:06:08 -0500 Subject: [PATCH 25/28] update test. --- tests/test_filepath.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_filepath.py b/tests/test_filepath.py index d79c00e5b..3e94e4885 100644 --- a/tests/test_filepath.py +++ b/tests/test_filepath.py @@ -206,7 +206,7 @@ def test_filepath_class_no_checksum(): for handler in logger.handlers: # Clean up handler if handler.name == "test_limit_warning": logger.removeHandler(handler) - assert "Warning skipped checksum for file with hash" in log_contents + assert "Skipped checksum for file with hash:" in log_contents def test_filepath_cleanup(table=Filepath(), store="repo"): From ce4f1b3db6f31162d19f1eac661eab62e7ebc741 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Fri, 10 Jun 2022 11:30:56 -0500 Subject: [PATCH 26/28] apply suggestions from code review. --- datajoint/external.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index a30c635df..0f18a9a56 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -326,7 +326,7 @@ def _need_checksum(local_filepath): if uuid_from_file(local_filepath) != contents_hash: # this should never happen without outside interference raise DataJointError( - f"'{local_filepath}' downloaded but did not pass checksum'" + f"'{local_filepath}' downloaded but did not pass checksum." ) if not _need_checksum(local_filepath): logger.warning( @@ -335,7 +335,7 @@ def _need_checksum(local_filepath): if size != Path(local_filepath).stat().st_size: # this should never happen without outside interference raise DataJointError( - f"'{local_filepath}' downloaded but size is not the same (skipped checksum due to config)'" + f"'{local_filepath}' downloaded but size is not the same (skipped checksum due to config)." ) return str(local_filepath), contents_hash From 7dabad9e7ab0922de6326276596354044509ca38 Mon Sep 17 00:00:00 2001 From: jverswijver Date: Mon, 13 Jun 2022 11:47:28 -0500 Subject: [PATCH 27/28] update changelog. --- CHANGELOG.md | 6 ++++-- docs-parts/intro/Releases_lang1.rst | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f276ce651..5436bc1db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,10 @@ ### 0.13.6 -- Jun 13, 2022 * Add - Config option to set threshold for when to stop using checksums for filepath stores. PR #1025 -* Add - unified package level logger for package (#667) PR #1031 -* Update - swap various datajoint messages, warnings, etc. to use the new logger. (#667) PR #1031 +* Add - Unified package level logger for package (#667) PR #1031 +* Update - Swap various datajoint messages, warnings, etc. to use the new logger. (#667) PR #1031 +* Bugfix - Fix query caching deleting non-datajoint files PR #1027 +* Update - Minimum Python version for Datajoint-Python is now 3.7 PR #1027 ### 0.13.5 -- May 19, 2022 * Update - Import ABC from collections.abc for Python 3.10 compatibility diff --git a/docs-parts/intro/Releases_lang1.rst b/docs-parts/intro/Releases_lang1.rst index d66c039e0..236fd7fb9 100644 --- a/docs-parts/intro/Releases_lang1.rst +++ b/docs-parts/intro/Releases_lang1.rst @@ -1,8 +1,10 @@ ### 0.13.6 -- Jun 13, 2022 ---------------------- * Add - Config option to set threshold for when to stop using checksums for filepath stores. PR #1025 -* Add - unified package level logger for package (#667) PR #1031 -* Update - swap various datajoint messages, warnings, etc. to use the new logger. (#667) PR #1031 +* Add - Unified package level logger for package (#667) PR #1031 +* Update - Swap various datajoint messages, warnings, etc. to use the new logger. (#667) PR #1031 +* Bugfix - Fix query caching deleting non-datajoint files PR #1027 +* Update - Minimum Python version for Datajoint-Python is now 3.7 PR #1027 0.13.5 -- May 19, 2022 ---------------------- From cc803b4cf33f3474ca998a3e22b3c77dbadadc5d Mon Sep 17 00:00:00 2001 From: jverswijver Date: Mon, 13 Jun 2022 14:26:26 -0500 Subject: [PATCH 28/28] apply suggestions from code review. --- datajoint/external.py | 33 ++++++++++++++++------------- docs-parts/intro/Releases_lang1.rst | 2 +- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/datajoint/external.py b/datajoint/external.py index 0f18a9a56..265152cd4 100644 --- a/datajoint/external.py +++ b/datajoint/external.py @@ -304,9 +304,15 @@ def download_filepath(self, filepath_hash): :return: hash (UUID) of the contents of the downloaded file or Nones """ - def _need_checksum(local_filepath): + def _need_checksum(local_filepath, expected_size): limit = config.get("filepath_checksum_size_limit") - return limit is None or Path(local_filepath).stat().st_size < limit + actual_size = Path(local_filepath).stat().st_size + if expected_size != actual_size: + # this should never happen without outside interference + raise DataJointError( + f"'{local_filepath}' downloaded but size did not match." + ) + return limit is None or actual_size < limit if filepath_hash is not None: relative_filepath, contents_hash, size = ( @@ -316,27 +322,24 @@ def _need_checksum(local_filepath): local_filepath = Path(self.spec["stage"]).absolute() / relative_filepath file_exists = Path(local_filepath).is_file() and ( - not _need_checksum(local_filepath) + not _need_checksum(local_filepath, size) or uuid_from_file(local_filepath) == contents_hash ) if not file_exists: self._download_file(external_path, local_filepath) - if _need_checksum(local_filepath): - if uuid_from_file(local_filepath) != contents_hash: - # this should never happen without outside interference - raise DataJointError( - f"'{local_filepath}' downloaded but did not pass checksum." - ) - if not _need_checksum(local_filepath): - logger.warning( - f"Skipped checksum for file with hash: {contents_hash}, and path: {local_filepath}" - ) - if size != Path(local_filepath).stat().st_size: + if ( + _need_checksum(local_filepath, size) + and uuid_from_file(local_filepath) != contents_hash + ): # this should never happen without outside interference raise DataJointError( - f"'{local_filepath}' downloaded but size is not the same (skipped checksum due to config)." + f"'{local_filepath}' downloaded but did not pass checksum." ) + if not _need_checksum(local_filepath, size): + logger.warning( + f"Skipped checksum for file with hash: {contents_hash}, and path: {local_filepath}" + ) return str(local_filepath), contents_hash # --- UTILITIES --- diff --git a/docs-parts/intro/Releases_lang1.rst b/docs-parts/intro/Releases_lang1.rst index 236fd7fb9..6a803eebe 100644 --- a/docs-parts/intro/Releases_lang1.rst +++ b/docs-parts/intro/Releases_lang1.rst @@ -1,4 +1,4 @@ -### 0.13.6 -- Jun 13, 2022 +0.13.6 -- Jun 13, 2022 ---------------------- * Add - Config option to set threshold for when to stop using checksums for filepath stores. PR #1025 * Add - Unified package level logger for package (#667) PR #1031