Patch summary (dvc/config.py, dvc/data_cloud.py): registers a boolean
`verify` remote option in the remote config schema, and makes
DataCloud.pull() save the checksums of pulled cache files to the state
when the remote's `verify` flag is false.

diff --git a/dvc/config.py b/dvc/config.py
index 65250c76de..0759395d04 100644
--- a/dvc/config.py
+++ b/dvc/config.py
@@ -198,6 +198,7 @@ class Config(object):  # pylint: disable=too-many-instance-attributes
     SECTION_REMOTE_ASK_PASSWORD = "ask_password"
     SECTION_REMOTE_GSS_AUTH = "gss_auth"
     SECTION_REMOTE_NO_TRAVERSE = "no_traverse"
+    SECTION_REMOTE_VERIFY = "verify"
     SECTION_REMOTE_SCHEMA = {
         Required(SECTION_REMOTE_URL): str,
         SECTION_AWS_REGION: str,
@@ -232,6 +233,7 @@ class Config(object):  # pylint: disable=too-many-instance-attributes
         SECTION_GDRIVE_USER_CREDENTIALS_FILE: str,
         PRIVATE_CWD: str,
         SECTION_REMOTE_NO_TRAVERSE: Bool,
+        SECTION_REMOTE_VERIFY: Bool,
     }
 
     SECTION_STATE = "state"
diff --git a/dvc/data_cloud.py b/dvc/data_cloud.py
index fadd536bd0..e85c8b2808 100644
--- a/dvc/data_cloud.py
+++ b/dvc/data_cloud.py
@@ -73,13 +73,25 @@ def pull(self, cache, jobs=None, remote=None, show_checksums=False):
             show_checksums (bool): show checksums instead of file names in
                 information messages.
         """
-        return self.repo.cache.local.pull(
-            cache,
-            jobs=jobs,
-            remote=self.get_remote(remote, "pull"),
-            show_checksums=show_checksums,
+        remote = self.get_remote(remote, "pull")
+        downloaded_items_num = self.repo.cache.local.pull(
+            cache, jobs=jobs, remote=remote, show_checksums=show_checksums
         )
 
+        if not remote.verify:
+            self._save_pulled_checksums(cache)
+
+        return downloaded_items_num
+
+    def _save_pulled_checksums(self, cache):
+        for checksum in cache["local"].keys():
+            cache_file = self.repo.cache.local.checksum_to_path_info(checksum)
+            if self.repo.cache.local.exists(cache_file):
+                # We can safely save here, as existing corrupted files will be
+                # removed upon status, while files corrupted during download
+                # will not be moved from tmp_file (see `RemoteBASE.download()`)
+                self.repo.state.save(cache_file, checksum)
+
     def status(self, cache, jobs=None, remote=None, show_checksums=False):
         """Check status of data items in a cloud-agnostic way.
 
Patch summary (dvc/remote/base.py, dvc/remote/gdrive.py,
tests/func/test_data_cloud.py): RemoteBASE reads the `verify` remote
option (class default False; RemoteGDrive overrides the default to True),
and test_verify_checksums expects no get_file_checksum calls on pull by
default and three calls once `verify` is enabled for the remote.

diff --git a/dvc/remote/base.py b/dvc/remote/base.py
index 788742c8c1..7262ba1a9c 100644
--- a/dvc/remote/base.py
+++ b/dvc/remote/base.py
@@ -80,6 +80,7 @@ class RemoteBASE(object):
     CHECKSUM_JOBS = max(1, min(4, cpu_count() // 2))
     DEFAULT_CACHE_TYPES = ["copy"]
     DEFAULT_NO_TRAVERSE = True
+    DEFAULT_VERIFY = False
 
     state = StateNoop()
 
@@ -93,6 +94,9 @@ def __init__(self, repo, config):
         self.no_traverse = config.get(
             Config.SECTION_REMOTE_NO_TRAVERSE, self.DEFAULT_NO_TRAVERSE
         )
+        self.verify = config.get(
+            Config.SECTION_REMOTE_VERIFY, self.DEFAULT_VERIFY
+        )
         self._dir_info = {}
 
         types = config.get(Config.SECTION_CACHE_TYPE, None)
diff --git a/dvc/remote/gdrive.py b/dvc/remote/gdrive.py
index a38b2a1a2b..24d65aa256 100644
--- a/dvc/remote/gdrive.py
+++ b/dvc/remote/gdrive.py
@@ -63,6 +63,7 @@ class RemoteGDrive(RemoteBASE):
     path_cls = CloudURLInfo
     REQUIRES = {"pydrive2": "pydrive2"}
     DEFAULT_NO_TRAVERSE = False
+    DEFAULT_VERIFY = True
 
     GDRIVE_USER_CREDENTIALS_DATA = "GDRIVE_USER_CREDENTIALS_DATA"
     DEFAULT_USER_CREDENTIALS_FILE = "gdrive-user-credentials.json"
diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py
index 748a7fe631..137fd674c8 100644
--- a/tests/func/test_data_cloud.py
+++ b/tests/func/test_data_cloud.py
@@ -9,10 +9,11 @@
 from mock import patch
 
 from dvc.cache import NamedCache
+from dvc.compat import fspath
 from dvc.config import Config
 from dvc.data_cloud import DataCloud
 from dvc.main import main
-from dvc.remote import RemoteAZURE
+from dvc.remote import RemoteAZURE, RemoteConfig
 from dvc.remote import RemoteGDrive
 from dvc.remote import RemoteGS
 from dvc.remote import RemoteHDFS
@@ -653,3 +654,37 @@ def test(self):
         assert self.message_header in self._caplog.text
         assert self.message_foo_part in self._caplog.text
         assert self.message_bar_part in self._caplog.text
+
+
+def test_verify_checksums(tmp_dir, scm, dvc, mocker, tmp_path_factory):
+    tmp_dir.dvc_gen({"file": "file1 content"}, commit="add file")
+    tmp_dir.dvc_gen({"dir": {"subfile": "file2 content"}}, commit="add dir")
+
+    RemoteConfig(dvc.config).add(
+        "local_remote",
+        fspath(tmp_path_factory.mktemp("local_remote")),
+        default=True,
+    )
+    dvc.push()
+
+    # remove artifacts and cache to trigger fetching
+    os.remove("file")
+    shutil.rmtree("dir")
+    shutil.rmtree(dvc.cache.local.cache_dir)
+
+    checksum_spy = mocker.spy(dvc.cache.local, "get_file_checksum")
+
+    dvc.pull()
+    assert checksum_spy.call_count == 0
+
+    # Removing cache will invalidate existing state entries
+    shutil.rmtree(dvc.cache.local.cache_dir)
+
+    dvc.config.set(
+        Config.SECTION_REMOTE_FMT.format("local_remote"),
+        Config.SECTION_REMOTE_VERIFY,
+        "True",
+    )
+
+    dvc.pull()
+    assert checksum_spy.call_count == 3