Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

checkout: show summary by default and introduce --show-changes flag #3401

Merged
merged 6 commits into from
Mar 16, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 89 additions & 7 deletions dvc/command/checkout.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,94 @@
import argparse
import logging

import colorama

from dvc.command.base import append_doc_link
from dvc.command.base import CmdBase
from dvc.exceptions import CheckoutError

logger = logging.getLogger(__name__)


def _human_join(words=None):
skshetry marked this conversation as resolved.
Show resolved Hide resolved
words = list(words or [])
if not words:
return ""

return (
"{before} and {after}".format(
before=", ".join(words[:-1]), after=words[-1],
)
if len(words) > 1
else words[0]
)


def log_summary(stats):
message = [
("added", "{added} added"),
("deleted", "{deleted} deleted"),
("modified", "{modified} modified"),
]
summary = {
stat: len(stats[stat]) for stat, num in message if stats.get(stat)
}
if not summary:
return

template = _human_join(
fragment for stat, fragment in message if stat in summary
)
logger.info(template.format_map(summary))
skshetry marked this conversation as resolved.
Show resolved Hide resolved


def log_changes(stats):
colors = [
("modified", colorama.Fore.YELLOW,),
("added", colorama.Fore.GREEN),
("deleted", colorama.Fore.RED,),
]

for state, color in colors:
entries = stats.get(state)

if not entries:
continue

for entry in entries:
logger.info(
"{color}{state}{nc}{spacing}{entry}".format(
color=color,
state=state[0].capitalize(),
skshetry marked this conversation as resolved.
Show resolved Hide resolved
nc=colorama.Fore.RESET,
spacing="\t",
entry=entry,
)
)


class CmdCheckout(CmdBase):
def run(self):
self.repo.checkout(
targets=self.args.targets,
with_deps=self.args.with_deps,
force=self.args.force,
relink=self.args.relink,
recursive=self.args.recursive,
)
stats, exc = None, None
try:
stats = self.repo.checkout(
targets=self.args.targets,
with_deps=self.args.with_deps,
force=self.args.force,
relink=self.args.relink,
recursive=self.args.recursive,
)
except CheckoutError as _exc:
exc = _exc
skshetry marked this conversation as resolved.
Show resolved Hide resolved
stats = exc.stats

if self.args.show_changes:
log_changes(stats)
else:
log_summary(stats)

if exc:
raise exc
return 0


Expand All @@ -26,6 +102,12 @@ def add_parser(subparsers, parent_parser):
help=CHECKOUT_HELP,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
checkout_parser.add_argument(
"--show-changes",
action="store_true",
default=False,
help="Show list of changes",
)
checkout_parser.add_argument(
"-d",
"--with-deps",
Expand Down
12 changes: 8 additions & 4 deletions dvc/command/data_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

from dvc.command.base import append_doc_link
from dvc.command.base import CmdBase
from dvc.exceptions import DvcException
from dvc.command.checkout import log_summary
from dvc.exceptions import DvcException, CheckoutError


logger = logging.getLogger(__name__)
Expand All @@ -19,7 +20,7 @@ def check_up_to_date(cls, processed_files_count):
class CmdDataPull(CmdDataBase):
def run(self):
try:
processed_files_count = self.repo.pull(
stats = self.repo.pull(
targets=self.args.targets,
jobs=self.args.jobs,
remote=self.args.remote,
Expand All @@ -29,10 +30,13 @@ def run(self):
force=self.args.force,
recursive=self.args.recursive,
)
except DvcException:
log_summary(stats)
except (CheckoutError, DvcException) as exc:
log_summary(getattr(exc, "stats", {}))
logger.exception("failed to pull data from the cloud")
return 1
self.check_up_to_date(processed_files_count)

self.check_up_to_date(stats["downloaded"])
return 0


Expand Down
4 changes: 3 additions & 1 deletion dvc/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,9 @@ def __init__(self, amount):


class CheckoutError(DvcException):
def __init__(self, target_infos):
def __init__(self, target_infos, stats=None):
self.target_infos = target_infos
self.stats = stats
targets = [str(t) for t in target_infos]
m = (
"Checkout failed for following targets:\n {}\nDid you "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry, really minor request (feel free to nor change this), came out as I was reviewing:

ERROR: unexpected error - Checkout failed for following targets:
 data/prepared
data/features
data/data.xml

there is an extra space that is not needed before data/prepared. I think the change is \n {}\n -> \n{}\n

Expand Down
8 changes: 6 additions & 2 deletions dvc/external_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from dvc.compat import fspath
from dvc.repo import Repo
from dvc.config import NoRemoteError, NotDvcRepoError
from dvc.exceptions import NoRemoteInExternalRepoError
from dvc.exceptions import NoRemoteInExternalRepoError, CheckoutError
from dvc.exceptions import OutputNotFoundError, NoOutputInExternalRepoError
from dvc.exceptions import FileMissingError, PathMissingError
from dvc.utils.fs import remove, fs_copy, move
Expand Down Expand Up @@ -106,7 +106,11 @@ def _pull_cached(self, out, path_info, dest):
if out.changed_cache(filter_info=src):
self.cloud.pull(out.get_used_cache(filter_info=src))

failed = out.checkout(filter_info=src)
failed = False
try:
out.checkout(filter_info=src)
except CheckoutError:
failed = True
skshetry marked this conversation as resolved.
Show resolved Hide resolved

move(src, dest)
remove(tmp)
Expand Down
36 changes: 28 additions & 8 deletions dvc/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import dvc.prompt as prompt
from dvc.exceptions import (
CheckoutError,
DvcException,
ConfirmRemoveError,
DvcIgnoreInCollectedDirError,
Expand Down Expand Up @@ -851,20 +852,25 @@ def safe_remove(self, path_info, force=False):
self.remove(path_info)

def _checkout_file(
self, path_info, checksum, force, progress_callback=None
self, path_info, checksum, force, progress_callback=None, relink=False
):
"""The file is changed we need to checkout a new copy"""
added, modified = True, False
cache_info = self.checksum_to_path_info(checksum)
if self.exists(path_info):
logger.debug("data '{}' will be replaced.", path_info)
self.safe_remove(path_info, force=force)
added, modified = False, True

self.link(cache_info, path_info)
self.state.save_link(path_info)
self.state.save(path_info, checksum)
if progress_callback:
progress_callback(str(path_info))

# relink is not modified
skshetry marked this conversation as resolved.
Show resolved Hide resolved
return added, modified and not relink

def makedirs(self, path_info):
"""Optional: Implement only if the remote needs to create
directories before copying/linking/moving data
Expand All @@ -879,9 +885,11 @@ def _checkout_dir(
relink=False,
filter_info=None,
):
added, modified = False, False
# Create dir separately so that dir is created
# even if there are no files in it
if not self.exists(path_info):
added = True
self.makedirs(path_info)

dir_info = self.get_dir_cache(checksum)
Expand All @@ -899,27 +907,36 @@ def _checkout_dir(

entry_checksum_info = {self.PARAM_CHECKSUM: entry_checksum}
if relink or self.changed(entry_info, entry_checksum_info):
modified = True
self.safe_remove(entry_info, force=force)
self.link(entry_cache_info, entry_info)
self.state.save(entry_info, entry_checksum)
if progress_callback:
progress_callback(str(entry_info))

self._remove_redundant_files(path_info, dir_info, force)
modified = (
self._remove_redundant_files(path_info, dir_info, force)
or modified
)

self.state.save_link(path_info)
self.state.save(path_info, checksum)

# relink is not modified, assume it as nochange
return added, not added and modified and not relink

skshetry marked this conversation as resolved.
Show resolved Hide resolved
def _remove_redundant_files(self, path_info, dir_info, force):
existing_files = set(self.walk_files(path_info))

needed_files = {
path_info / entry[self.PARAM_RELPATH] for entry in dir_info
}

for path in existing_files - needed_files:
redundant_files = existing_files - needed_files
for path in redundant_files:
self.safe_remove(path, force)

return bool(redundant_files)

def checkout(
self,
path_info,
Expand Down Expand Up @@ -966,12 +983,14 @@ def checkout(
self.path_info, checksum, filter_info
),
)
return failed
if failed:
raise CheckoutError([failed])
skshetry marked this conversation as resolved.
Show resolved Hide resolved
return

logger.debug("Checking out '{}' with cache '{}'.", path_info, checksum)

self._checkout(
path_info, checksum, force, progress_callback, relink, filter_info
return self._checkout(
path_info, checksum, force, progress_callback, relink, filter_info,
)

def _checkout(
Expand All @@ -985,8 +1004,9 @@ def _checkout(
):
if not self.is_dir_checksum(checksum):
return self._checkout_file(
path_info, checksum, force, progress_callback=progress_callback
path_info, checksum, force, progress_callback, relink
)

return self._checkout_dir(
path_info, checksum, force, progress_callback, relink, filter_info
)
Expand Down
55 changes: 39 additions & 16 deletions dvc/repo/checkout.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,31 @@
import logging
import os

from dvc.compat import fspath
from dvc.exceptions import CheckoutError
from dvc.exceptions import CheckoutErrorSuggestGit
from dvc.progress import Tqdm

from dvc.utils import relpath

logger = logging.getLogger(__name__)


def _cleanup_unused_links(repo):
def _get_unused_links(repo):
used = [
out.fspath
for stage in repo.stages
for out in stage.outs
if out.scheme == "local"
]
repo.state.remove_unused_links(used)
return repo.state.get_unused_links(used)
skshetry marked this conversation as resolved.
Show resolved Hide resolved


def _fspath_dir(path, root):
if not os.path.exists(str(path)):
return str(path)

path = relpath(fspath(path), root)
return os.path.join(path, "") if os.path.isdir(path) else path


def get_all_files_numbers(pairs):
Expand All @@ -34,9 +44,19 @@ def _checkout(
):
from dvc.stage import StageFileDoesNotExistError, StageFileBadNameError

unused = []
stats = {
"added": [],
"deleted": [],
"modified": [],
"failed": [],
}
if not targets:
targets = [None]
_cleanup_unused_links(self)
unused = _get_unused_links(self)

stats["deleted"] = [_fspath_dir(u, self.root_dir) for u in unused]
self.state.remove_links(unused)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this not under if? Looks fragile this way, we shouldn't call remove_links() at all if a particular target is specified.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's empty if there's no argument.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, I need to check if something is a dir before deleting.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checkout in dvc:

  • checks out particular thing(s) if that things (target) is specified
  • if no target is specified it checks out everything and removes anything previously checked out and having no references now.

I.e. if there is target we don't try to change the rest of the workspace, so .remove_links() should not be called. The logic of your code is different.

Copy link
Member Author

@skshetry skshetry Mar 13, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If targets are specified, unused will be empty and hence, nothing will get removed. If your concern is regarding calling .remove_links() at all, I don't think there's any reason to be defensive.


pairs = set()
for target in targets:
Expand All @@ -52,20 +72,23 @@ def _checkout(
raise CheckoutErrorSuggestGit(target) from exc

total = get_all_files_numbers(pairs)
if total == 0:
logger.info("Nothing to do")
failed = []
with Tqdm(
total=total, unit="file", desc="Checkout", disable=total == 0
) as pbar:
for stage, filter_info in pairs:
failed.extend(
stage.checkout(
force=force,
progress_callback=pbar.update_desc,
relink=relink,
filter_info=filter_info,
)
result = stage.checkout(
force=force,
progress_callback=pbar.update_desc,
relink=relink,
filter_info=filter_info,
)
if failed:
raise CheckoutError(failed)
for data in ["failed", "added", "modified"]:
stats[data] += [
skshetry marked this conversation as resolved.
Show resolved Hide resolved
_fspath_dir(path, self.root_dir) for path in result[data]
]

if stats.get("failed"):
raise CheckoutError(stats["failed"], stats)

del stats["failed"]
return stats
Loading