Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fetch: trust remote checksums #3200

Merged
merged 7 commits
on Jan 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions dvc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ class Config(object): # pylint: disable=too-many-instance-attributes
SECTION_REMOTE_ASK_PASSWORD = "ask_password"
SECTION_REMOTE_GSS_AUTH = "gss_auth"
SECTION_REMOTE_NO_TRAVERSE = "no_traverse"
SECTION_REMOTE_VERIFY = "verify"
SECTION_REMOTE_SCHEMA = {
Required(SECTION_REMOTE_URL): str,
SECTION_AWS_REGION: str,
Expand Down Expand Up @@ -232,6 +233,7 @@ class Config(object): # pylint: disable=too-many-instance-attributes
SECTION_GDRIVE_USER_CREDENTIALS_FILE: str,
PRIVATE_CWD: str,
SECTION_REMOTE_NO_TRAVERSE: Bool,
SECTION_REMOTE_VERIFY: Bool,
}

SECTION_STATE = "state"
Expand Down
22 changes: 17 additions & 5 deletions dvc/data_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,25 @@ def pull(self, cache, jobs=None, remote=None, show_checksums=False):
show_checksums (bool): show checksums instead of file names in
information messages.
"""
return self.repo.cache.local.pull(
cache,
jobs=jobs,
remote=self.get_remote(remote, "pull"),
show_checksums=show_checksums,
remote = self.get_remote(remote, "pull")
downloaded_items_num = self.repo.cache.local.pull(
cache, jobs=jobs, remote=remote, show_checksums=show_checksums
)

if not remote.verify:
self._save_pulled_checksums(cache)

return downloaded_items_num

def _save_pulled_checksums(self, cache):
    """Record checksums of pulled cache files in the repo state.

    For every checksum listed under ``cache["local"]``, resolve the
    matching path in the local cache and, when that path exists, store
    the ``(path, checksum)`` pair via ``self.repo.state.save``.

    Args:
        cache: mapping-like object whose ``"local"`` entry is iterable
            over checksums.
    """
    local_cache = self.repo.cache.local

    for checksum in cache["local"]:
        cache_file = local_cache.checksum_to_path_info(checksum)
        if not local_cache.exists(cache_file):
            continue

        # We can safely save here, as existing corrupted files will be
        # removed upon status, while files corrupted during download
        # will not be moved from tmp_file (see `RemoteBASE.download()`)
        self.repo.state.save(cache_file, checksum)

def status(self, cache, jobs=None, remote=None, show_checksums=False):
"""Check status of data items in a cloud-agnostic way.

Expand Down
4 changes: 4 additions & 0 deletions dvc/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ class RemoteBASE(object):
CHECKSUM_JOBS = max(1, min(4, cpu_count() // 2))
DEFAULT_CACHE_TYPES = ["copy"]
DEFAULT_NO_TRAVERSE = True
DEFAULT_VERIFY = False

state = StateNoop()

Expand All @@ -93,6 +94,9 @@ def __init__(self, repo, config):
self.no_traverse = config.get(
Config.SECTION_REMOTE_NO_TRAVERSE, self.DEFAULT_NO_TRAVERSE
)
self.verify = config.get(
Config.SECTION_REMOTE_VERIFY, self.DEFAULT_VERIFY
)
self._dir_info = {}

types = config.get(Config.SECTION_CACHE_TYPE, None)
Expand Down
1 change: 1 addition & 0 deletions dvc/remote/gdrive.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ class RemoteGDrive(RemoteBASE):
path_cls = CloudURLInfo
REQUIRES = {"pydrive2": "pydrive2"}
DEFAULT_NO_TRAVERSE = False
DEFAULT_VERIFY = True

GDRIVE_USER_CREDENTIALS_DATA = "GDRIVE_USER_CREDENTIALS_DATA"
DEFAULT_USER_CREDENTIALS_FILE = "gdrive-user-credentials.json"
Expand Down
37 changes: 36 additions & 1 deletion tests/func/test_data_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@
from mock import patch

from dvc.cache import NamedCache
from dvc.compat import fspath
from dvc.config import Config
from dvc.data_cloud import DataCloud
from dvc.main import main
from dvc.remote import RemoteAZURE
from dvc.remote import RemoteAZURE, RemoteConfig
from dvc.remote import RemoteGDrive
from dvc.remote import RemoteGS
from dvc.remote import RemoteHDFS
Expand Down Expand Up @@ -653,3 +654,37 @@ def test(self):
assert self.message_header in self._caplog.text
assert self.message_foo_part in self._caplog.text
assert self.message_bar_part in self._caplog.text


def test_verify_checksums(tmp_dir, scm, dvc, mocker, tmp_path_factory):
    """Check that `pull` hashes fetched files only when `verify` is set."""
    remote_name = "local_remote"

    tmp_dir.dvc_gen({"file": "file1 content"}, commit="add file")
    tmp_dir.dvc_gen({"dir": {"subfile": "file2 content"}}, commit="add dir")

    remote_dir = tmp_path_factory.mktemp(remote_name)
    RemoteConfig(dvc.config).add(remote_name, fspath(remote_dir), default=True)
    dvc.push()

    cache_dir = dvc.cache.local.cache_dir

    # Drop the workspace artifacts and the cache so that pull has to fetch
    os.remove("file")
    shutil.rmtree("dir")
    shutil.rmtree(cache_dir)

    get_checksum_spy = mocker.spy(dvc.cache.local, "get_file_checksum")

    # With the remote's default settings no checksum is recomputed on pull
    dvc.pull()
    assert get_checksum_spy.call_count == 0

    # Removing cache will invalidate existing state entries
    shutil.rmtree(cache_dir)

    dvc.config.set(
        Config.SECTION_REMOTE_FMT.format(remote_name),
        Config.SECTION_REMOTE_VERIFY,
        "True",
    )

    # NOTE(review): presumably one call per fetched cache object ("file",
    # the "dir" listing and "subfile") - confirm against the cache layout
    dvc.pull()
    assert get_checksum_spy.call_count == 3