Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to disable checksums for external files and blobs #1025

Merged
merged 32 commits into from
Jun 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
2a57998
Add option to disable checksums
jverswijver May 17, 2022
45d1250
fix minor bug.
jverswijver May 17, 2022
c0775ba
Revert commits
jverswijver May 19, 2022
6cee155
update checksum disable.
jverswijver May 25, 2022
f949e38
Refactor logic for if statements.
jverswijver May 25, 2022
1ab945c
add logging
jverswijver May 25, 2022
dcaea18
add logging, add test, update changelog.
jverswijver May 27, 2022
4c1ee16
Merge branch 'master' into remove_checksum
jverswijver May 27, 2022
5715b77
update changelog.
jverswijver May 27, 2022
aed8bf1
minor change to comment.
jverswijver May 27, 2022
e64f32b
minor changes to test and logger
jverswijver May 27, 2022
52902a6
fix style
jverswijver May 27, 2022
21d75e6
update logic
jverswijver May 27, 2022
f49b178
fix style
jverswijver May 27, 2022
2abac24
fix style
jverswijver May 27, 2022
c35d509
remove logger
jverswijver May 27, 2022
00a4b8a
fix test, bump version
jverswijver May 27, 2022
761fab1
apply suggestions from code review.
jverswijver May 27, 2022
9fda361
apply suggestions from code review.
jverswijver May 31, 2022
053c0fc
change a few .format to use f string.
jverswijver May 31, 2022
267095f
apply suggestion from code review.
jverswijver May 31, 2022
7d7c42a
Merge branch 'add_logger' of https://github.com/jverswijver/datajoint…
jverswijver Jun 8, 2022
bc01762
update test.
jverswijver Jun 8, 2022
ed3cd01
change error log message.
jverswijver Jun 8, 2022
10a5868
apply suggestions from code review.
jverswijver Jun 10, 2022
3ce21f5
apply suggestions from code review
jverswijver Jun 10, 2022
1ecbe5f
update test.
jverswijver Jun 10, 2022
ce4f1b3
apply suggestions from code review.
jverswijver Jun 10, 2022
eac203f
Merge branch 'add_logger' of https://github.com/jverswijver/datajoint…
jverswijver Jun 10, 2022
dd3a249
Merge branch 'master' of https://github.com/jverswijver/datajoint-pyt…
jverswijver Jun 13, 2022
7dabad9
update changelog.
jverswijver Jun 13, 2022
cc803b4
apply suggestions from code review.
jverswijver Jun 13, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
## Release notes

### 0.13.6 -- Jun 13, 2022
* Add - unified package level logger for package (#667) PR #1031
* Update - swap various datajoint messages, warnings, etc. to use the new logger. (#667) PR #1031
* Add - Config option to set threshold for when to stop using checksums for filepath stores. PR #1025
* Add - Unified package level logger for package (#667) PR #1031
* Update - Swap various datajoint messages, warnings, etc. to use the new logger. (#667) PR #1031
* Bugfix - Fix query caching deleting non-datajoint files PR #1027
* Update - Minimum Python version for Datajoint-Python is now 3.7 PR #1027

### 0.13.5 -- May 19, 2022
* Update - Import ABC from collections.abc for Python 3.10 compatibility
Expand Down
53 changes: 34 additions & 19 deletions datajoint/external.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path, PurePosixPath, PureWindowsPath
from collections.abc import Mapping
from tqdm import tqdm
import logging
from .settings import config
from .errors import DataJointError, MissingExternalFile
from .hash import uuid_from_buffer, uuid_from_file
Expand All @@ -10,6 +11,8 @@
from . import s3
from .utils import safe_write, safe_copy

logger = logging.getLogger(__name__.split(".")[0])

CACHE_SUBFOLDING = (
2,
2,
Expand Down Expand Up @@ -72,9 +75,7 @@ def definition(self):

@property
def table_name(self):
return "{external_table_root}_{store}".format(
external_table_root=EXTERNAL_TABLE_ROOT, store=self.store
)
return f"{EXTERNAL_TABLE_ROOT}_{self.store}"

@property
def s3(self):
Expand Down Expand Up @@ -276,9 +277,7 @@ def upload_filepath(self, local_filepath):
# the tracking entry exists, check that it's the same file as before
if contents_hash != check_hash[0]:
raise DataJointError(
"A different version of '{file}' has already been placed.".format(
file=relative_filepath
)
f"A different version of '{relative_filepath}' has already been placed."
)
else:
# upload the file and create its tracking entry
Expand All @@ -304,27 +303,43 @@ def download_filepath(self, filepath_hash):
:param filepath_hash: The hash (UUID) of the relative_path
:return: hash (UUID) of the contents of the downloaded file or Nones
"""

def _need_checksum(local_filepath, expected_size):
limit = config.get("filepath_checksum_size_limit")
actual_size = Path(local_filepath).stat().st_size
if expected_size != actual_size:
# this should never happen without outside interference
raise DataJointError(
f"'{local_filepath}' downloaded but size did not match."
)
return limit is None or actual_size < limit

if filepath_hash is not None:
relative_filepath, contents_hash = (self & {"hash": filepath_hash}).fetch1(
"filepath", "contents_hash"
)
relative_filepath, contents_hash, size = (
self & {"hash": filepath_hash}
).fetch1("filepath", "contents_hash", "size")
external_path = self._make_external_filepath(relative_filepath)
local_filepath = Path(self.spec["stage"]).absolute() / relative_filepath
file_exists = (
Path(local_filepath).is_file()
and uuid_from_file(local_filepath) == contents_hash

file_exists = Path(local_filepath).is_file() and (
not _need_checksum(local_filepath, size)
or uuid_from_file(local_filepath) == contents_hash
)

if not file_exists:
self._download_file(external_path, local_filepath)
checksum = uuid_from_file(local_filepath)
if (
checksum != contents_hash
): # this should never happen without outside interference
_need_checksum(local_filepath, size)
and uuid_from_file(local_filepath) != contents_hash
):
# this should never happen without outside interference
raise DataJointError(
"'{file}' downloaded but did not pass checksum'".format(
file=local_filepath
)
f"'{local_filepath}' downloaded but did not pass checksum."
)
if not _need_checksum(local_filepath, size):
logger.warning(
f"Skipped checksum for file with hash: {contents_hash}, and path: {local_filepath}"
)
return str(local_filepath), contents_hash

# --- UTILITIES ---
Expand Down Expand Up @@ -402,7 +417,7 @@ def delete(
delete_external_files=None,
limit=None,
display_progress=True,
errors_as_string=True
errors_as_string=True,
):
"""

Expand Down
1 change: 1 addition & 0 deletions datajoint/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
"display.show_tuple_count": True,
"database.use_tls": None,
"enable_python_native_blobs": True, # python-native/dj0 encoding support
"filepath_checksum_size_limit": None, # file size limit for when to disable checksums
}
)

Expand Down
7 changes: 5 additions & 2 deletions docs-parts/intro/Releases_lang1.rst
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
0.13.6 -- Jun 13, 2022
----------------------
* Add - unified package level logger for package (#667) PR #1031
* Update - swap various datajoint messages, warnings, etc. to use the new logger. (#667) PR #1031
* Add - Config option to set threshold for when to stop using checksums for filepath stores. PR #1025
* Add - Unified package level logger for package (#667) PR #1031
* Update - Swap various datajoint messages, warnings, etc. to use the new logger. (#667) PR #1031
* Bugfix - Fix query caching deleting non-datajoint files PR #1027
* Update - Minimum Python version for Datajoint-Python is now 3.7 PR #1027

0.13.5 -- May 19, 2022
----------------------
Expand Down
28 changes: 26 additions & 2 deletions tests/test_filepath.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
import os
from pathlib import Path
import random

from .schema_external import schema, Filepath, FilepathS3, stores_config
import logging
import io

logger = logging.getLogger("datajoint")


def setUp(self):
Expand Down Expand Up @@ -132,7 +135,9 @@ def test_duplicate_error_s3():
test_duplicate_error(store="repo-s3")


def test_filepath_class(table=Filepath(), store="repo"):
def test_filepath_class(table=Filepath(), store="repo", verify_checksum=True):
if not verify_checksum:
dj.config["filepath_checksum_size_limit"] = 0
dj.errors._switch_filepath_types(True)
stage_path = dj.config["stores"][store]["stage"]
# create a mock file
Expand Down Expand Up @@ -169,6 +174,7 @@ def test_filepath_class(table=Filepath(), store="repo"):
# delete from external table
table.external[store].delete(delete_external_files=True)
dj.errors._switch_filepath_types(False)
dj.config["filepath_checksum_size_limit"] = None


def test_filepath_class_again():
Expand All @@ -185,6 +191,24 @@ def test_filepath_class_s3_again():
test_filepath_class(FilepathS3(), "repo-s3")


def test_filepath_class_no_checksum():
log_capture = io.StringIO()
stream_handler = logging.StreamHandler(log_capture)
log_format = logging.Formatter(
"[%(asctime)s][%(funcName)s][%(levelname)s]: %(message)s"
)
stream_handler.setFormatter(log_format)
stream_handler.set_name("test_limit_warning")
logger.addHandler(stream_handler)
test_filepath_class(verify_checksum=False)
log_contents = log_capture.getvalue()
log_capture.close()
for handler in logger.handlers: # Clean up handler
if handler.name == "test_limit_warning":
logger.removeHandler(handler)
assert "Skipped checksum for file with hash:" in log_contents


def test_filepath_cleanup(table=Filepath(), store="repo"):
"""test deletion of filepath entries from external table"""

Expand Down