From 839fbcb022c18f1dad1df7de28484f1f9906a2b7 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Tue, 25 Feb 2020 14:58:36 +0545 Subject: [PATCH 1/6] checkout: implement --show-changes flag to show detailed checkout details Show summary by default on checkout checkout: show directory modified even when files are removed from inside it checkout: refactor template logic checkout: change output format checkout: do not show summary if nothing's changed checkout: do not show summary if show_changes is specified checkout: show '/' at the end if it's a directory rename variable name test: cover stats for checkout checkout: show relink as not modified checkout: remove unwanted message during empty checkout pull: show summary on checkout import/get: change external_repo to catch CheckoutError fix tests and check if path is url checkout: support external deps/outputs tests: checkout on external outputs --- dvc/command/checkout.py | 96 ++++++++++- dvc/command/data_sync.py | 12 +- dvc/exceptions.py | 4 +- dvc/external_repo.py | 8 +- dvc/remote/base.py | 36 +++- dvc/repo/checkout.py | 55 +++++-- dvc/repo/pull.py | 6 +- dvc/stage.py | 31 ++-- dvc/state.py | 16 +- tests/func/test_checkout.py | 244 +++++++++++++++++++++++++++- tests/func/test_state.py | 28 +++- tests/unit/command/test_checkout.py | 57 ++++++- 12 files changed, 522 insertions(+), 71 deletions(-) diff --git a/dvc/command/checkout.py b/dvc/command/checkout.py index 0e2a533ec5..cbf8b29b6d 100644 --- a/dvc/command/checkout.py +++ b/dvc/command/checkout.py @@ -1,18 +1,94 @@ import argparse +import logging + +import colorama from dvc.command.base import append_doc_link from dvc.command.base import CmdBase +from dvc.exceptions import CheckoutError + +logger = logging.getLogger(__name__) + + +def _human_join(words=None): + words = list(words or []) + if not words: + return "" + + return ( + "{before} and {after}".format( + before=", ".join(words[:-1]), after=words[-1], + ) + if len(words) > 1 + else words[0] + ) + + +def log_summary(stats): + message = [ + ("added", "{added} added"), + ("deleted", "{deleted} deleted"), + ("modified", "{modified} modified"), + ] + summary = { + stat: len(stats[stat]) for stat, num in message if stats.get(stat) + } + if not summary: + return + + template = _human_join( + fragment for stat, fragment in message if stat in summary + ) + logger.info(template.format_map(summary)) + + +def log_changes(stats): + colors = [ + ("modified", colorama.Fore.YELLOW,), + ("added", colorama.Fore.GREEN), + ("deleted", colorama.Fore.RED,), + ] + + for state, color in colors: + entries = stats.get(state) + + if not entries: + continue + + for entry in entries: + logger.info( + "{color}{state}{nc}{spacing}{entry}".format( + color=color, + state=state[0].capitalize(), + nc=colorama.Fore.RESET, + spacing="\t", + entry=entry, + ) + ) class CmdCheckout(CmdBase): def run(self): - self.repo.checkout( - targets=self.args.targets, - with_deps=self.args.with_deps, - force=self.args.force, - relink=self.args.relink, - recursive=self.args.recursive, - ) + stats, exc = None, None + try: + stats = self.repo.checkout( + targets=self.args.targets, + with_deps=self.args.with_deps, + force=self.args.force, + relink=self.args.relink, + recursive=self.args.recursive, + ) + except CheckoutError as _exc: + exc = _exc + stats = exc.stats + + if self.args.show_changes: + log_changes(stats) + else: + log_summary(stats) + + if exc: + raise exc return 0 @@ -26,6 +102,12 @@ def add_parser(subparsers, parent_parser): help=CHECKOUT_HELP, formatter_class=argparse.RawDescriptionHelpFormatter, ) + checkout_parser.add_argument( + "--show-changes", + action="store_true", + default=False, + help="Show list of changes", + ) checkout_parser.add_argument( "-d", "--with-deps", diff --git a/dvc/command/data_sync.py b/dvc/command/data_sync.py index c12b1afcac..7be18cc747 100644 --- a/dvc/command/data_sync.py +++ b/dvc/command/data_sync.py @@ -3,7 +3,8 @@ from dvc.command.base import append_doc_link from dvc.command.base import CmdBase -from dvc.exceptions import DvcException +from dvc.command.checkout import log_summary +from dvc.exceptions import DvcException, CheckoutError logger = logging.getLogger(__name__) @@ -19,7 +20,7 @@ def check_up_to_date(cls, processed_files_count): class CmdDataPull(CmdDataBase): def run(self): try: - processed_files_count = self.repo.pull( + stats = self.repo.pull( targets=self.args.targets, jobs=self.args.jobs, remote=self.args.remote, @@ -29,10 +30,13 @@ def run(self): force=self.args.force, recursive=self.args.recursive, ) - except DvcException: + log_summary(stats) + except (CheckoutError, DvcException) as exc: + log_summary(getattr(exc, "stats", {})) logger.exception("failed to pull data from the cloud") return 1 - self.check_up_to_date(processed_files_count) + + self.check_up_to_date(stats["downloaded"]) return 0 diff --git a/dvc/exceptions.py b/dvc/exceptions.py index 2f6c25a47b..18d02e1e9f 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -256,7 +256,9 @@ def __init__(self, amount): class CheckoutError(DvcException): - def __init__(self, target_infos): + def __init__(self, target_infos, stats=None): + self.target_infos = target_infos + self.stats = stats targets = [str(t) for t in target_infos] m = ( "Checkout failed for following targets:\n {}\nDid you " diff --git a/dvc/external_repo.py b/dvc/external_repo.py index c16d55b584..2c2154f71d 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -11,7 +11,7 @@ from dvc.compat import fspath from dvc.repo import Repo from dvc.config import NoRemoteError, NotDvcRepoError -from dvc.exceptions import NoRemoteInExternalRepoError +from dvc.exceptions import NoRemoteInExternalRepoError, CheckoutError from dvc.exceptions import OutputNotFoundError, NoOutputInExternalRepoError from dvc.exceptions import FileMissingError, PathMissingError from dvc.utils.fs import remove, fs_copy, move @@ -106,7 +106,11 @@ def _pull_cached(self, out, path_info, dest): if out.changed_cache(filter_info=src): self.cloud.pull(out.get_used_cache(filter_info=src)) - failed = out.checkout(filter_info=src) + failed = False + try: + out.checkout(filter_info=src) + except CheckoutError: + failed = True move(src, dest) remove(tmp) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 2d2cf4748e..cb6be34bcd 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -14,6 +14,7 @@ import dvc.prompt as prompt from dvc.exceptions import ( + CheckoutError, DvcException, ConfirmRemoveError, DvcIgnoreInCollectedDirError, @@ -851,13 +852,15 @@ def safe_remove(self, path_info, force=False): self.remove(path_info) def _checkout_file( - self, path_info, checksum, force, progress_callback=None + self, path_info, checksum, force, progress_callback=None, relink=False ): """The file is changed we need to checkout a new copy""" + added, modified = True, False cache_info = self.checksum_to_path_info(checksum) if self.exists(path_info): logger.debug("data '{}' will be replaced.", path_info) self.safe_remove(path_info, force=force) + added, modified = False, True self.link(cache_info, path_info) self.state.save_link(path_info) @@ -865,6 +868,9 @@ def _checkout_file( if progress_callback: progress_callback(str(path_info)) + # relink is not modified + return added, modified and not relink + def makedirs(self, path_info): """Optional: Implement only if the remote needs to create directories before copying/linking/moving data @@ -879,9 +885,11 @@ def _checkout_dir( relink=False, filter_info=None, ): + added, modified = False, False # Create dir separately so that dir is created # even if there are no files in it if not self.exists(path_info): + added = True self.makedirs(path_info) dir_info = self.get_dir_cache(checksum) @@ -899,27 +907,36 @@ def _checkout_dir( entry_checksum_info = {self.PARAM_CHECKSUM: entry_checksum} if relink or self.changed(entry_info, entry_checksum_info): + modified = True self.safe_remove(entry_info, force=force) self.link(entry_cache_info, entry_info) self.state.save(entry_info, entry_checksum) if progress_callback: progress_callback(str(entry_info)) - self._remove_redundant_files(path_info, dir_info, force) + modified = ( + self._remove_redundant_files(path_info, dir_info, force) + or modified + ) self.state.save_link(path_info) self.state.save(path_info, checksum) + # relink is not modified, assume it as nochange + return added, not added and modified and not relink + def _remove_redundant_files(self, path_info, dir_info, force): existing_files = set(self.walk_files(path_info)) needed_files = { path_info / entry[self.PARAM_RELPATH] for entry in dir_info } - - for path in existing_files - needed_files: + redundant_files = existing_files - needed_files + for path in redundant_files: self.safe_remove(path, force) + return bool(redundant_files) + def checkout( self, path_info, @@ -966,12 +983,14 @@ def checkout( self.path_info, checksum, filter_info ), ) - return failed + if failed: + raise CheckoutError([failed]) + return logger.debug("Checking out '{}' with cache '{}'.", path_info, checksum) - self._checkout( - path_info, checksum, force, progress_callback, relink, filter_info + return self._checkout( + path_info, checksum, force, progress_callback, relink, filter_info, ) def _checkout( @@ -985,8 +1004,9 @@ def _checkout( ): if not self.is_dir_checksum(checksum): return self._checkout_file( - path_info, checksum, force, progress_callback=progress_callback + path_info, checksum, force, progress_callback, relink ) + return self._checkout_dir( path_info, checksum, force, progress_callback, relink, filter_info ) diff --git a/dvc/repo/checkout.py b/dvc/repo/checkout.py index 26f9e18e81..d7cd3547fd 100644 --- a/dvc/repo/checkout.py +++ b/dvc/repo/checkout.py @@ -1,21 +1,31 @@ import logging +import os +from dvc.compat import fspath from dvc.exceptions import CheckoutError from dvc.exceptions import CheckoutErrorSuggestGit from dvc.progress import Tqdm - +from dvc.utils import relpath logger = logging.getLogger(__name__) -def _cleanup_unused_links(repo): +def _get_unused_links(repo): used = [ out.fspath for stage in repo.stages for out in stage.outs if out.scheme == "local" ] - repo.state.remove_unused_links(used) + return repo.state.get_unused_links(used) + + +def _fspath_dir(path, root): + if not os.path.exists(str(path)): + return str(path) + + path = relpath(fspath(path), root) + return os.path.join(path, "") if os.path.isdir(path) else path def get_all_files_numbers(pairs): @@ -34,9 +44,19 @@ def _checkout( ): from dvc.stage import StageFileDoesNotExistError, StageFileBadNameError + unused = [] + stats = { + "added": [], + "deleted": [], + "modified": [], + "failed": [], + } if not targets: targets = [None] - _cleanup_unused_links(self) + unused = _get_unused_links(self) + + stats["deleted"] = [_fspath_dir(u, self.root_dir) for u in unused] + self.state.remove_links(unused) pairs = set() for target in targets: @@ -52,20 +72,23 @@ def _checkout( raise CheckoutErrorSuggestGit(target) from exc total = get_all_files_numbers(pairs) - if total == 0: - logger.info("Nothing to do") - failed = [] with Tqdm( total=total, unit="file", desc="Checkout", disable=total == 0 ) as pbar: for stage, filter_info in pairs: - failed.extend( - stage.checkout( - force=force, - progress_callback=pbar.update_desc, - relink=relink, - filter_info=filter_info, - ) + result = stage.checkout( + force=force, + progress_callback=pbar.update_desc, + relink=relink, + filter_info=filter_info, ) - if failed: - raise CheckoutError(failed) + for data in ["failed", "added", "modified"]: + stats[data] += [ + _fspath_dir(path, self.root_dir) for path in result[data] + ] + + if stats.get("failed"): + raise CheckoutError(stats["failed"], stats) + + del stats["failed"] + return stats diff --git a/dvc/repo/pull.py b/dvc/repo/pull.py index fadd34d3da..993d50be81 100644 --- a/dvc/repo/pull.py +++ b/dvc/repo/pull.py @@ -27,7 +27,9 @@ def pull( with_deps=with_deps, recursive=recursive, ) - self._checkout( + stats = self._checkout( targets=targets, with_deps=with_deps, force=force, recursive=recursive ) - return processed_files_count + + stats["downloaded"] = processed_files_count + return stats diff --git a/dvc/stage.py b/dvc/stage.py index 79e52add9e..0d8a3792c7 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -15,7 +15,7 @@ import dvc.dependency as dependency import dvc.output as output import dvc.prompt as prompt -from dvc.exceptions import DvcException +from dvc.exceptions import CheckoutError, DvcException from dvc.utils import dict_md5 from dvc.utils import fix_env from dvc.utils import relpath @@ -982,18 +982,25 @@ def checkout( relink=False, filter_info=None, ): - failed_checkouts = [] + checkouts = {"failed": [], "added": [], "modified": []} for out in self._filter_outs(filter_info): - failed = out.checkout( - force=force, - tag=self.tag, - progress_callback=progress_callback, - relink=relink, - filter_info=filter_info, - ) - if failed: - failed_checkouts.append(failed) - return failed_checkouts + try: + result = out.checkout( + force=force, + tag=self.tag, + progress_callback=progress_callback, + relink=relink, + filter_info=filter_info, + ) + added, modified = result or (None, None) + if modified: + checkouts["modified"].append(out.path_info) + elif added: + checkouts["added"].append(out.path_info) + except CheckoutError as exc: + checkouts["failed"].extend(exc.target_infos) + + return checkouts @staticmethod def _status(entries): diff --git a/dvc/state.py b/dvc/state.py index 38f77eabaf..9b0ca60ca0 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -439,7 +439,7 @@ def save_link(self, path_info): ) self._execute(cmd, (relative_path, self._to_sqlite(inode), mtime)) - def remove_unused_links(self, used): + def get_unused_links(self, used): """Removes all saved links except the ones that are used. Args: @@ -453,20 +453,22 @@ def remove_unused_links(self, used): inode = self._from_sqlite(inode) path = os.path.join(self.root_dir, relpath) - if path in used: - continue - - if not os.path.exists(path): + if path in used or not os.path.exists(path): continue actual_inode = get_inode(path) actual_mtime, _ = get_mtime_and_size(path, self.repo.tree) - if inode == actual_inode and mtime == actual_mtime: + if (inode, mtime) == (actual_inode, actual_mtime): logger.debug("Removing '{}' as unused link.", path) - remove(path) unused.append(relpath) + return unused + + def remove_links(self, unused): + for path in unused: + remove(path) + for chunk_unused in to_chunks( unused, chunk_size=SQLITE_MAX_VARIABLES_NUMBER ): diff --git a/tests/func/test_checkout.py b/tests/func/test_checkout.py index 1f49ca4d84..b588e92819 100644 --- a/tests/func/test_checkout.py +++ b/tests/func/test_checkout.py @@ -22,6 +22,8 @@ from dvc.utils.fs import walk_files from dvc.utils.stage import dump_stage_file from dvc.utils.stage import load_stage_file +from dvc.remote import RemoteS3 +from tests.remotes import S3 from tests.basic_env import TestDvc from tests.basic_env import TestDvcGit from tests.func.test_repro import TestRepro @@ -321,7 +323,8 @@ def test(self): stage.outs[0].remove() self.assertFalse(os.path.exists(dname)) - self.dvc.checkout(force=True) + stats = self.dvc.checkout(force=True) + assert stats["added"] == [dname + os.sep] self.assertTrue(os.path.isdir(dname)) self.assertEqual(len(os.listdir(dname)), 0) @@ -337,7 +340,8 @@ def test(self): ) self.assertTrue(stage is not None) - self.dvc.checkout(force=True) + stats = self.dvc.checkout(force=True) + assert not any(stats.values()) class TestCheckoutWithDeps(TestRepro): @@ -421,7 +425,8 @@ def test(self): ret = main(["add", self.FOO]) self.assertEqual(0, ret) - self.dvc.checkout(targets=[self.FOO + ".dvc"], recursive=True) + stats = self.dvc.checkout(targets=[self.FOO + ".dvc"], recursive=True) + assert stats == {"added": [], "modified": [], "deleted": []} class TestCheckoutMovedCacheDirWithSymlinks(TestDvc): @@ -489,7 +494,8 @@ def test_checkout_relink(tmp_dir, dvc, link, link_test_func): dvc.unprotect("dir/data") assert not link_test_func("dir/data") - dvc.checkout(["dir.dvc"], relink=True) + stats = dvc.checkout(["dir.dvc"], relink=True) + assert stats == empty_checkout assert link_test_func("dir/data") @@ -502,7 +508,8 @@ def test_checkout_relink_protected(tmp_dir, dvc, link): dvc.unprotect("foo") assert os.access("foo", os.W_OK) - dvc.checkout(["foo.dvc"], relink=True) + stats = dvc.checkout(["foo.dvc"], relink=True) + assert stats == empty_checkout assert not os.access("foo", os.W_OK) @@ -513,5 +520,230 @@ def test_checkout_relink_protected(tmp_dir, dvc, link): def test_partial_checkout(tmp_dir, dvc, target): tmp_dir.dvc_gen({"dir": {"subdir": {"file": "file"}, "other": "other"}}) shutil.rmtree("dir") - dvc.checkout([target]) + stats = dvc.checkout([target]) + assert stats["added"] == ["dir" + os.sep] assert list(walk_files("dir")) == [os.path.join("dir", "subdir", "file")] + + +empty_checkout = {"added": [], "deleted": [], "modified": []} + + +def test_stats_on_empty_checkout(tmp_dir, dvc, scm): + assert dvc.checkout() == empty_checkout + tmp_dir.dvc_gen( + {"dir": {"subdir": {"file": "file"}, "other": "other"}}, + commit="initial", + ) + assert dvc.checkout() == empty_checkout + + +def test_stats_on_checkout(tmp_dir, dvc, scm): + tmp_dir.dvc_gen( + { + "dir": {"subdir": {"file": "file"}, "other": "other"}, + "foo": "foo", + "bar": "bar", + }, + commit="initial", + ) + scm.checkout("HEAD~") + stats = dvc.checkout() + assert set(stats["deleted"]) == {"dir" + os.sep, "foo", "bar"} + + scm.checkout("-") + stats = dvc.checkout() + assert set(stats["added"]) == {"bar", "dir" + os.sep, "foo"} + + tmp_dir.gen({"lorem": "lorem", "bar": "new bar", "dir2": {"file": "file"}}) + (tmp_dir / "foo").unlink() + scm.repo.git.rm("foo.dvc") + tmp_dir.dvc_add(["bar", "lorem", "dir2"], commit="second") + + scm.checkout("HEAD~") + stats = dvc.checkout() + assert set(stats["modified"]) == {"bar"} + assert set(stats["added"]) == {"foo"} + assert set(stats["deleted"]) == {"lorem", "dir2" + os.sep} + + scm.checkout("-") + stats = dvc.checkout() + assert set(stats["modified"]) == {"bar"} + assert set(stats["added"]) == {"dir2" + os.sep, "lorem"} + assert set(stats["deleted"]) == {"foo"} + + +def test_checkout_stats_on_failure(tmp_dir, dvc, scm): + stages = tmp_dir.dvc_gen( + {"foo": "foo", "dir": {"subdir": {"file": "file"}}, "other": "other"}, + commit="initial", + ) + tmp_dir.dvc_gen({"foo": "foobar", "other": "other other"}, commit="second") + + # corrupt cache + cache = stages[0].outs[0].cache_path + with open(cache, "a") as fd: + fd.write("destroy cache") + + scm.checkout("HEAD~") + with pytest.raises(CheckoutError) as exc: + dvc.checkout(force=True) + + assert exc.value.stats == { + **empty_checkout, + "failed": ["foo"], + "modified": ["other"], + } + + +def test_stats_on_added_file_from_tracked_dir(tmp_dir, dvc, scm): + tmp_dir.dvc_gen( + {"dir": {"subdir": {"file": "file"}, "other": "other"}}, + commit="initial", + ) + + tmp_dir.gen("dir/subdir/newfile", "newfile") + tmp_dir.dvc_add("dir", commit="add newfile") + scm.checkout("HEAD~") + assert dvc.checkout() == {**empty_checkout, "modified": ["dir" + os.sep]} + assert dvc.checkout() == empty_checkout + + scm.checkout("-") + assert dvc.checkout() == {**empty_checkout, "modified": ["dir" + os.sep]} + assert dvc.checkout() == empty_checkout + + +def test_stats_on_updated_file_from_tracked_dir(tmp_dir, dvc, scm): + tmp_dir.dvc_gen( + {"dir": {"subdir": {"file": "file"}, "other": "other"}}, + commit="initial", + ) + + tmp_dir.gen("dir/subdir/file", "what file?") + tmp_dir.dvc_add("dir", commit="update file") + scm.checkout("HEAD~") + assert dvc.checkout() == {**empty_checkout, "modified": ["dir" + os.sep]} + assert dvc.checkout() == empty_checkout + + scm.checkout("-") + assert dvc.checkout() == {**empty_checkout, "modified": ["dir" + os.sep]} + assert dvc.checkout() == empty_checkout + + +def test_stats_on_removed_file_from_tracked_dir(tmp_dir, dvc, scm): + tmp_dir.dvc_gen( + {"dir": {"subdir": {"file": "file"}, "other": "other"}}, + commit="initial", + ) + + (tmp_dir / "dir" / "subdir" / "file").unlink() + tmp_dir.dvc_add("dir", commit="removed file from subdir") + scm.checkout("HEAD~") + assert dvc.checkout() == {**empty_checkout, "modified": ["dir" + os.sep]} + assert dvc.checkout() == empty_checkout + + scm.checkout("-") + assert dvc.checkout() == {**empty_checkout, "modified": ["dir" + os.sep]} + assert dvc.checkout() == empty_checkout + + +def test_stats_on_show_changes_does_not_show_summary( + tmp_dir, dvc, scm, caplog +): + tmp_dir.dvc_gen( + {"dir": {"subdir": {"file": "file"}}, "other": "other"}, + commit="initial", + ) + scm.checkout("HEAD~") + + with caplog.at_level(logging.INFO, logger="dvc"): + caplog.clear() + assert main(["checkout", "--show-changes"]) == 0 + for out in ["D\tdir" + os.sep, "D\tother"]: + assert out in caplog.text + assert "modified" not in caplog.text + assert "deleted" not in caplog.text + assert "added" not in caplog.text + + +def test_stats_does_not_show_changes_by_default(tmp_dir, dvc, scm, caplog): + tmp_dir.dvc_gen( + {"dir": {"subdir": {"file": "file"}}, "other": "other"}, + commit="initial", + ) + scm.checkout("HEAD~") + + with caplog.at_level(logging.INFO, logger="dvc"): + caplog.clear() + assert main(["checkout"]) == 0 + assert "2 deleted" in caplog.text + assert "dir" not in caplog.text + assert "other" not in caplog.text + + +@pytest.mark.parametrize("link", ["hardlink", "symlink", "copy"]) +def test_checkout_with_relink_existing(tmp_dir, dvc, link): + tmp_dir.dvc_gen("foo", "foo") + (tmp_dir / "foo").unlink() + + tmp_dir.dvc_gen("bar", "bar") + dvc.cache.local.cache_types = [link] + + stats = dvc.checkout(relink=True) + assert stats == {**empty_checkout, "added": ["foo"]} + + +def test_checkout_with_deps(tmp_dir, dvc): + tmp_dir.dvc_gen({"foo": "foo"}) + dvc.run( + fname="copy_file.dvc", cmd="echo foo > bar", outs=["bar"], deps=["foo"] + ) + + (tmp_dir / "bar").unlink() + (tmp_dir / "foo").unlink() + + stats = dvc.checkout(["copy_file.dvc"], with_deps=False) + assert stats == {**empty_checkout, "added": ["bar"]} + + (tmp_dir / "bar").unlink() + stats = dvc.checkout(["copy_file.dvc"], with_deps=True) + assert set(stats["added"]) == {"foo", "bar"} + + +def test_checkout_recursive(tmp_dir, dvc): + tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}}) + dvc.add("dir", recursive=True) + + (tmp_dir / "dir" / "foo").unlink() + (tmp_dir / "dir" / "bar").unlink() + + stats = dvc.checkout(["dir"], recursive=True) + assert set(stats["added"]) == { + os.path.join("dir", "foo"), + os.path.join("dir", "bar"), + } + + +@pytest.mark.skipif( + not S3.should_test(), reason="Only run with S3 credentials" +) +def test_checkout_for_external_outputs(tmp_dir, dvc): + dvc.cache.s3 = RemoteS3(dvc, {"url": S3.get_url()}) + + remote = RemoteS3(dvc, {"url": S3.get_url()}) + file_path = remote.path_info / "foo" + remote.s3.put_object( + Bucket=remote.path_info.bucket, Key=file_path.path, Body="foo" + ) + + dvc.add(str(remote.path_info / "foo")) + + remote.remove(file_path) + stats = dvc.checkout(force=True) + assert stats == {**empty_checkout, "added": [str(file_path)]} + assert remote.exists(file_path) + + remote.s3.put_object( + Bucket=remote.path_info.bucket, Key=file_path.path, Body="foo\nfoo" + ) + stats = dvc.checkout(force=True) + assert stats == {**empty_checkout, "modified": [str(file_path)]} diff --git a/tests/func/test_state.py b/tests/func/test_state.py index ed33ddbd54..36618f3025 100644 --- a/tests/func/test_state.py +++ b/tests/func/test_state.py @@ -1,4 +1,5 @@ import mock +import os from dvc.path_info import PathInfo from dvc.state import State @@ -69,16 +70,33 @@ def test_get_state_record_for_inode(get_inode_mock, tmp_dir, dvc): assert ret is not None -def test_remove_unused_links(tmp_dir, dvc): - assert len(tmp_dir.dvc_gen("foo", "foo_content")) == 1 - assert len(tmp_dir.dvc_gen("bar", "bar_content")) == 1 +def test_remove_links(tmp_dir, dvc): + tmp_dir.dvc_gen({"foo": "foo_content", "bar": "bar_content"}) - cmd_count_links = "SELECT count(*) FROM {}".format(State.LINK_STATE_TABLE) with dvc.state: + cmd_count_links = "SELECT count(*) FROM {}".format( + State.LINK_STATE_TABLE + ) result = dvc.state._execute(cmd_count_links).fetchone()[0] assert result == 2 - dvc.state.remove_unused_links([]) + dvc.state.remove_links(["foo", "bar"]) result = dvc.state._execute(cmd_count_links).fetchone()[0] assert result == 0 + + +def test_get_unused_links(tmp_dir, dvc): + tmp_dir.dvc_gen({"foo": "foo_content", "bar": "bar_content"}) + + with dvc.state: + links = [os.path.join(dvc.root_dir, link) for link in ["foo", "bar"]] + assert set(dvc.state.get_unused_links([])) == {"foo", "bar"} + assert set(dvc.state.get_unused_links(links[:1])) == {"bar"} + assert set(dvc.state.get_unused_links(links)) == set() + assert set( + dvc.state.get_unused_links( + used=links[:1] + + [os.path.join(dvc.root_dir, "not-existing-file")] + ) + ) == {"bar"} diff --git a/tests/unit/command/test_checkout.py b/tests/unit/command/test_checkout.py index 21fc7920e4..8e0a8f5f19 100644 --- a/tests/unit/command/test_checkout.py +++ b/tests/unit/command/test_checkout.py @@ -1,5 +1,7 @@ +import logging + from dvc.cli import parse_args -from dvc.command.checkout import CmdCheckout +from dvc.command.checkout import CmdCheckout, log_summary, log_changes def test_checkout(tmp_dir, dvc, mocker): @@ -19,3 +21,56 @@ def test_checkout(tmp_dir, dvc, mocker): relink=True, with_deps=True, ) + + +def test_log_summary(caplog): + stats = { + "added": ["file1", "file2", "file3"], + "deleted": ["file4", "file5"], + "modified": ["file6", "file7"], + } + + def _assert_output(stats, expected_text): + with caplog.at_level(logging.INFO, logger="dvc"): + caplog.clear() + log_summary(stats) + assert expected_text in caplog.text + + _assert_output(stats, "3 added, 2 deleted and 2 modified") + + del stats["deleted"][1] + _assert_output(stats, "3 added, 1 deleted and 2 modified") + + del stats["deleted"][0] + _assert_output(stats, "3 added and 2 modified") + + del stats["modified"] + _assert_output(stats, "3 added") + + _assert_output({}, "") + + +def test_log_changes(caplog): + stats = { + "added": ["file1", "dir1/"], + "deleted": ["dir2/"], + "modified": ["file2"], + } + + from itertools import zip_longest + + def _assert_output(stats, expected_outs): + with caplog.at_level(logging.INFO, logger="dvc"): + caplog.clear() + log_changes(stats) + actual_output = caplog.text.splitlines() + for out, line in zip_longest(expected_outs, actual_output): + assert out in line + + _assert_output(stats, ["M\tfile2", "A\tfile1", "A\tdir1/", "D\tdir2/"]) + + del stats["deleted"][0] + _assert_output(stats, ["M\tfile2", "A\tfile1", "A\tdir1/"]) + + del stats["modified"] + _assert_output(stats, ["A\tfile1", "A\tdir1/"]) From f2cf62ef5afb0f92e7ab7debb93393e242df5a57 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Wed, 11 Mar 2020 16:01:55 +0545 Subject: [PATCH 2/6] fix tests --- tests/func/test_checkout.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/func/test_checkout.py b/tests/func/test_checkout.py index b588e92819..7a67e3a6ae 100644 --- a/tests/func/test_checkout.py +++ b/tests/func/test_checkout.py @@ -573,14 +573,15 @@ def test_stats_on_checkout(tmp_dir, dvc, scm): def test_checkout_stats_on_failure(tmp_dir, dvc, scm): - stages = tmp_dir.dvc_gen( + tmp_dir.dvc_gen( {"foo": "foo", "dir": {"subdir": {"file": "file"}}, "other": "other"}, commit="initial", ) + stage = Stage.load(dvc, "foo.dvc") tmp_dir.dvc_gen({"foo": "foobar", "other": "other other"}, commit="second") # corrupt cache - cache = stages[0].outs[0].cache_path + cache = stage.outs[0].cache_path with open(cache, "a") as fd: fd.write("destroy cache") From 104b4507116468c8e758f45ab5ab23a9741da87f Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Thu, 12 Mar 2020 14:02:02 +0545 Subject: [PATCH 3/6] checkout: make show changes default and introduce summary flag to show summary --- dvc/command/checkout.py | 10 +++++----- tests/func/test_checkout.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dvc/command/checkout.py b/dvc/command/checkout.py index cbf8b29b6d..5dfed810ae 100644 --- a/dvc/command/checkout.py +++ b/dvc/command/checkout.py @@ -82,10 +82,10 @@ def run(self): exc = _exc stats = exc.stats - if self.args.show_changes: - log_changes(stats) - else: + if self.args.summary: log_summary(stats) + else: + log_changes(stats) if exc: raise exc @@ -103,10 +103,10 @@ def add_parser(subparsers, parent_parser): formatter_class=argparse.RawDescriptionHelpFormatter, ) checkout_parser.add_argument( - "--show-changes", + "--summary", action="store_true", default=False, - help="Show list of changes", + help="Show summary of the changes", ) checkout_parser.add_argument( "-d", diff --git a/tests/func/test_checkout.py b/tests/func/test_checkout.py index 7a67e3a6ae..ee06dc2416 100644 --- a/tests/func/test_checkout.py +++ b/tests/func/test_checkout.py @@ -658,7 +658,7 @@ def test_stats_on_show_changes_does_not_show_summary( with caplog.at_level(logging.INFO, logger="dvc"): caplog.clear() - assert main(["checkout", "--show-changes"]) == 0 + assert main(["checkout"]) == 0 for out in ["D\tdir" + os.sep, "D\tother"]: assert out in caplog.text assert "modified" not in caplog.text @@ -675,7 +675,7 @@ def test_stats_does_not_show_changes_by_default(tmp_dir, dvc, scm, caplog): with caplog.at_level(logging.INFO, logger="dvc"): caplog.clear() - assert main(["checkout"]) == 0 + assert main(["checkout", "--summary"]) == 0 assert "2 deleted" in caplog.text assert "dir" not in caplog.text assert "other" not in caplog.text From 5bcd1bf80c9d245fe8e6be456e9bb4978adbfd7f Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Thu, 12 Mar 2020 15:10:37 +0545 Subject: [PATCH 4/6] address suggestions --- dvc/command/checkout.py | 26 +++++++++----------------- dvc/external_repo.py | 6 +----- dvc/remote/base.py | 1 - dvc/repo/checkout.py | 4 ++-- 4 files changed, 12 insertions(+), 25 deletions(-) diff --git a/dvc/command/checkout.py b/dvc/command/checkout.py index 5dfed810ae..2d908f0ede 100644 --- a/dvc/command/checkout.py +++ b/dvc/command/checkout.py @@ -10,8 +10,8 @@ logger = logging.getLogger(__name__) -def _human_join(words=None): - words = list(words or []) +def _human_join(words): + words = list(words) if not words: return "" @@ -25,21 +25,13 @@ def _human_join(words=None): def log_summary(stats): - message = [ - ("added", "{added} added"), - ("deleted", "{deleted} deleted"), - ("modified", "{modified} modified"), - ] - summary = { - stat: len(stats[stat]) for stat, num in message if stats.get(stat) - } - if not summary: - return - - template = _human_join( - fragment for stat, fragment in message if stat in summary + states = ("added", "deleted", "modified") + summary = ( + "{} {}".format(len(stats[state]), state) + for state in states + if stats.get(state) ) - logger.info(template.format_map(summary)) + logger.info(_human_join(summary)) def log_changes(stats): @@ -59,7 +51,7 @@ def log_changes(stats): logger.info( "{color}{state}{nc}{spacing}{entry}".format( color=color, - state=state[0].capitalize(), + state=state[0].upper(), nc=colorama.Fore.RESET, spacing="\t", entry=entry, diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 2c2154f71d..61257ae28d 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -106,18 +106,14 @@ def _pull_cached(self, out, path_info, dest): if out.changed_cache(filter_info=src): self.cloud.pull(out.get_used_cache(filter_info=src)) - failed = False try: out.checkout(filter_info=src) except CheckoutError: - failed = True + raise FileNotFoundError move(src, dest) remove(tmp) - if failed: - raise FileNotFoundError - @wrap_with(threading.Lock()) def _set_cache_dir(self): try: diff --git a/dvc/remote/base.py b/dvc/remote/base.py index cb6be34bcd..d9cf39ae12 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -868,7 +868,6 @@ def _checkout_file( if progress_callback: progress_callback(str(path_info)) - # relink is not modified return added, modified and not relink def makedirs(self, path_info): diff --git a/dvc/repo/checkout.py b/dvc/repo/checkout.py index d7cd3547fd..3a239bf1cb 100644 --- a/dvc/repo/checkout.py +++ b/dvc/repo/checkout.py @@ -83,9 +83,9 @@ def _checkout( filter_info=filter_info, ) for data in ["failed", "added", "modified"]: - stats[data] += [ + stats[data].extend( _fspath_dir(path, self.root_dir) for path in result[data] - ] + ) if stats.get("failed"): raise CheckoutError(stats["failed"], stats) From 5fb1c902486555cafeffae3b9a54b7a73ef0aaa0 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Thu, 12 Mar 2020 15:16:01 +0545 Subject: [PATCH 5/6] exceptions: remove extra spacing on erorr --- dvc/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/exceptions.py b/dvc/exceptions.py index 18d02e1e9f..2a1b1cc5d1 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -261,7 +261,7 @@ def __init__(self, target_infos, stats=None): self.stats = stats targets = [str(t) for t in target_infos] m = ( - "Checkout failed for following targets:\n {}\nDid you " + "Checkout failed for following targets:\n{}\nDid you " "forget to fetch?".format("\n".join(targets)) ) super().__init__(m) From b80a44b8035af5153e81c55d348c1d1a9eb9ba5f Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Sun, 15 Mar 2020 15:35:44 +0545 Subject: [PATCH 6/6] Update dvc/command/checkout.py Co-Authored-By: Jorge Orpinel --- dvc/command/checkout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/command/checkout.py b/dvc/command/checkout.py index 2d908f0ede..323e349977 100644 --- a/dvc/command/checkout.py +++ b/dvc/command/checkout.py @@ -98,7 +98,7 @@ def add_parser(subparsers, parent_parser): "--summary", action="store_true", default=False, - help="Show summary of the changes", + help="Show summary of the changes.", ) checkout_parser.add_argument( "-d",