Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to disable checksums for external files and blobs #1025

Merged
merged 32 commits into from
Jun 13, 2022
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
2a57998
Add option to disable checksums
jverswijver May 17, 2022
45d1250
fix minor bug.
jverswijver May 17, 2022
c0775ba
Revert commits
jverswijver May 19, 2022
6cee155
update checksum disable.
jverswijver May 25, 2022
f949e38
Refactor logic for if statements.
jverswijver May 25, 2022
1ab945c
add logging
jverswijver May 25, 2022
dcaea18
add logging, add test, update changelog.
jverswijver May 27, 2022
4c1ee16
Merge branch 'master' into remove_checksum
jverswijver May 27, 2022
5715b77
update changelog.
jverswijver May 27, 2022
aed8bf1
minor change to comment.
jverswijver May 27, 2022
e64f32b
minor changes to test and logger
jverswijver May 27, 2022
52902a6
fix style
jverswijver May 27, 2022
21d75e6
update logic
jverswijver May 27, 2022
f49b178
fix style
jverswijver May 27, 2022
2abac24
fix style
jverswijver May 27, 2022
c35d509
remove logger
jverswijver May 27, 2022
00a4b8a
fix test, bump version
jverswijver May 27, 2022
761fab1
apply suggestions from code review.
jverswijver May 27, 2022
9fda361
apply suggestions from code review.
jverswijver May 31, 2022
053c0fc
change a few .format to use f string.
jverswijver May 31, 2022
267095f
apply suggestion from code review.
jverswijver May 31, 2022
7d7c42a
Merge branch 'add_logger' of https://github.com/jverswijver/datajoint…
jverswijver Jun 8, 2022
bc01762
update test.
jverswijver Jun 8, 2022
ed3cd01
change error log message.
jverswijver Jun 8, 2022
10a5868
apply suggestions from code review.
jverswijver Jun 10, 2022
3ce21f5
apply suggestions from code review
jverswijver Jun 10, 2022
1ecbe5f
update test.
jverswijver Jun 10, 2022
ce4f1b3
apply suggestions from code review.
jverswijver Jun 10, 2022
eac203f
Merge branch 'add_logger' of https://github.com/jverswijver/datajoint…
jverswijver Jun 10, 2022
dd3a249
Merge branch 'master' of https://github.com/jverswijver/datajoint-pyt…
jverswijver Jun 13, 2022
7dabad9
update changelog.
jverswijver Jun 13, 2022
cc803b4
apply suggestions from code review.
jverswijver Jun 13, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
## Release notes

### 0.13.6 -- TBD
* Add - Config option to set threshold for when to stop using checksums for filepath stores. PR #1025

### 0.13.5 -- May 19, 2022
* Update - Import ABC from collections.abc for Python 3.10 compatibility
* Bugfix - Fix multiprocessing value error (#1013) PR #1026
Expand Down
43 changes: 31 additions & 12 deletions datajoint/external.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
from .declare import EXTERNAL_TABLE_ROOT
from . import s3
from .utils import safe_write, safe_copy
import logging

logger = logging.getLogger(__name__)

CACHE_SUBFOLDING = (
2,
Expand Down Expand Up @@ -310,21 +313,37 @@ def download_filepath(self, filepath_hash):
)
external_path = self._make_external_filepath(relative_filepath)
local_filepath = Path(self.spec["stage"]).absolute() / relative_filepath
file_exists = (
Path(local_filepath).is_file()
and uuid_from_file(local_filepath) == contents_hash

file_exists = Path(local_filepath).is_file() and (
(
config.get("filepath_checksum_size_limit") # check if None
jverswijver marked this conversation as resolved.
Show resolved Hide resolved
and Path(local_filepath).stat().st_size
>= config.get("filepath_checksum_size_limit")
)
or uuid_from_file(local_filepath) == contents_hash
)

if not file_exists:
self._download_file(external_path, local_filepath)
checksum = uuid_from_file(local_filepath)
if (
checksum != contents_hash
): # this should never happen without outside interference
raise DataJointError(
"'{file}' downloaded but did not pass checksum'".format(
file=local_filepath
if not config.get("filepath_checksum_size_limit") or (
Path(local_filepath).stat().st_size
< config.get("filepath_checksum_size_limit")
):
checksum = uuid_from_file(local_filepath)
if (
checksum != contents_hash
): # this should never happen without outside interference
raise DataJointError(
"'{file}' downloaded but did not pass checksum'".format(
jverswijver marked this conversation as resolved.
Show resolved Hide resolved
file=local_filepath
)
jverswijver marked this conversation as resolved.
Show resolved Hide resolved
)
)
if config.get("filepath_checksum_size_limit") and Path(
local_filepath
).stat().st_size >= config.get("filepath_checksum_size_limit"):
logger.warning(
f"WARNING SKIPPED CHECKSUM FOR FILE WITH HASH: {contents_hash}"
jverswijver marked this conversation as resolved.
Show resolved Hide resolved
)
return str(local_filepath), contents_hash

# --- UTILITIES ---
Expand Down Expand Up @@ -402,7 +421,7 @@ def delete(
delete_external_files=None,
limit=None,
display_progress=True,
errors_as_string=True
errors_as_string=True,
):
"""

Expand Down
1 change: 1 addition & 0 deletions datajoint/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
"display.show_tuple_count": True,
"database.use_tls": None,
"enable_python_native_blobs": True, # python-native/dj0 encoding support
"filepath_checksum_size_limit": None, # file size limit for when to disable checksums
}
)

Expand Down
4 changes: 4 additions & 0 deletions docs-parts/intro/Releases_lang1.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
0.13.6 -- TBD
----------------------
* Add - Config option to set threshold for when to stop using checksums for filepath stores. PR #1025

0.13.5 -- May 19, 2022
----------------------
* Update - Import ABC from collections.abc for Python 3.10 compatibility
Expand Down
13 changes: 11 additions & 2 deletions tests/test_filepath.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
from pathlib import Path
import random

import logging
from .schema_external import schema, Filepath, FilepathS3, stores_config


Expand Down Expand Up @@ -132,7 +132,9 @@ def test_duplicate_error_s3():
test_duplicate_error(store="repo-s3")


def test_filepath_class(table=Filepath(), store="repo"):
def test_filepath_class(table=Filepath(), store="repo", verify_checksum=True):
if not verify_checksum:
dj.config["filepath_checksum_size_limit"] = 1 # cant be 0, breaks boolean logic
dj.errors._switch_filepath_types(True)
stage_path = dj.config["stores"][store]["stage"]
# create a mock file
Expand Down Expand Up @@ -169,6 +171,9 @@ def test_filepath_class(table=Filepath(), store="repo"):
# delete from external table
table.external[store].delete(delete_external_files=True)
dj.errors._switch_filepath_types(False)
dj.config["filepath_checksum_size_limit"] = None
# Todo: assert here that the warning from external.download_file has been logged
# Waiting on DJ Logging implementation
jverswijver marked this conversation as resolved.
Show resolved Hide resolved


def test_filepath_class_again():
Expand All @@ -185,6 +190,10 @@ def test_filepath_class_s3_again():
test_filepath_class(FilepathS3(), "repo-s3")


def test_filepath_class_no_checksum():
test_filepath_class(verify_checksum=False)


def test_filepath_cleanup(table=Filepath(), store="repo"):
"""test deletion of filepath entries from external table"""

Expand Down