Skip to content

Commit

Permalink
Merge pull request #2837 from danihodovic/feat/2515
Browse files Browse the repository at this point in the history
get: copy/download files tracked by Git
  • Loading branch information
efiop authored Dec 9, 2019
2 parents 237704e + 29b34ba commit 26a9702
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 10 deletions.
11 changes: 8 additions & 3 deletions dvc/command/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def run(self):


def add_parser(subparsers, parent_parser):
GET_HELP = "Download data from DVC repository."
GET_HELP = "Download/copy files or directories from DVC repository."
get_parser = subparsers.add_parser(
"get",
parents=[parent_parser],
Expand All @@ -44,9 +44,14 @@ def add_parser(subparsers, parent_parser):
get_parser.add_argument(
"url", help="URL of Git repository with DVC project to download from."
)
get_parser.add_argument("path", help="Path to data within DVC repository.")
get_parser.add_argument(
"-o", "--out", nargs="?", help="Destination path to put data to."
"path", help="Path to a file or directory within a DVC repository."
)
get_parser.add_argument(
"-o",
"--out",
nargs="?",
help="Destination path to copy/download files to.",
)
get_parser.add_argument(
"--rev", nargs="?", help="DVC repository git revision."
Expand Down
6 changes: 6 additions & 0 deletions dvc/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,12 @@ def __init__(self, path, cause=None):
)


class PathOutsideRepoError(DvcException):
def __init__(self, path, repo):
msg = "The path '{}' does not exist in the target repository '{}'."
super(PathOutsideRepoError, self).__init__(msg.format(path, repo))


class DvcIgnoreInCollectedDirError(DvcException):
def __init__(self, ignore_dirname):
super(DvcIgnoreInCollectedDirError, self).__init__(
Expand Down
46 changes: 39 additions & 7 deletions dvc/repo/get.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,39 @@
import logging
import os
import shutil

import shortuuid

from dvc.exceptions import GetDVCFileError
from dvc.exceptions import NotDvcRepoError
from dvc.exceptions import OutputNotFoundError
from dvc.exceptions import UrlNotDvcRepoError
from dvc.exceptions import PathOutsideRepoError
from dvc.external_repo import external_repo
from dvc.path_info import PathInfo
from dvc.stage import Stage
from dvc.state import StateNoop
from dvc.utils import resolve_output
from dvc.utils.fs import remove
from dvc.utils.compat import FileNotFoundError

logger = logging.getLogger(__name__)


def _copy_git_file(repo, src, dst, repo_url):
src_full_path = os.path.join(repo.root_dir, src)
dst_full_path = os.path.abspath(dst)

if os.path.isdir(src_full_path):
shutil.copytree(src_full_path, dst_full_path)
return

try:
shutil.copy2(src_full_path, dst_full_path)
except FileNotFoundError:
raise PathOutsideRepoError(src, repo_url)


@staticmethod
def get(url, path, out=None, rev=None):
out = resolve_output(path, out)
Expand Down Expand Up @@ -49,16 +66,31 @@ def get(url, path, out=None, rev=None):
# the same cache file might be used a few times in a directory.
repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

o = repo.find_out_by_relpath(path)
output = None
output_error = None

try:
output = repo.find_out_by_relpath(path)
except OutputNotFoundError as ex:
output_error = ex

is_git_file = output_error and not os.path.isabs(path)
is_not_cached = output and not output.use_cache

if is_git_file or is_not_cached:
_copy_git_file(repo, path, out, url)
return

if output_error:
raise OutputNotFoundError(path)

with repo.state:
repo.cloud.pull(o.get_used_cache())
o.path_info = PathInfo(os.path.abspath(out))
with o.repo.state:
o.checkout()
repo.cloud.pull(output.get_used_cache())
output.path_info = PathInfo(os.path.abspath(out))
with output.repo.state:
output.checkout()

except NotDvcRepoError:
raise UrlNotDvcRepoError(url)
except OutputNotFoundError:
raise OutputNotFoundError(path)
finally:
remove(tmp_dir)
78 changes: 78 additions & 0 deletions tests/func/test_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@
from dvc.config import Config
from dvc.exceptions import GetDVCFileError
from dvc.exceptions import UrlNotDvcRepoError
from dvc.exceptions import OutputNotFoundError
from dvc.exceptions import PathOutsideRepoError
from dvc.repo import Repo
from dvc.system import System
from dvc.utils import makedirs
from dvc.utils.compat import fspath
from dvc.utils import fspath_py35
from tests.utils import trees_equal


Expand All @@ -38,6 +41,36 @@ def test_get_repo_dir(erepo):
trees_equal(src, dst)


def test_get_regular_file(erepo):
src = "some_file"
dst = "some_file_imported"

src_path = os.path.join(erepo.root_dir, src)
erepo.create(src_path, "hello")
erepo.dvc.scm.add([src_path])
erepo.dvc.scm.commit("add a regular file")
Repo.get(erepo.root_dir, src, dst)

assert os.path.exists(dst)
assert os.path.isfile(dst)
assert filecmp.cmp(src_path, dst, shallow=False)


def test_get_regular_dir(erepo):
src = "some_directory"
dst = "some_directory_imported"

src_file_path = os.path.join(erepo.root_dir, src, "file.txt")
erepo.create(src_file_path, "hello")
erepo.dvc.scm.add([src_file_path])
erepo.dvc.scm.commit("add a regular dir")
Repo.get(erepo.root_dir, src, dst)

assert os.path.exists(dst)
assert os.path.isdir(dst)
trees_equal(os.path.join(erepo.root_dir, src), dst)


def test_cache_type_is_properly_overridden(erepo):
erepo.dvc.config.set(
Config.SECTION_CACHE, Config.SECTION_CACHE_TYPE, "symlink"
Expand Down Expand Up @@ -77,6 +110,51 @@ def test_get_a_dvc_file(erepo):
Repo.get(erepo.root_dir, "some_file.dvc")


# https://github.com/iterative/dvc/pull/2837#discussion_r352123053
def test_get_full_dvc_path(erepo):
external_data_dir = erepo.mkdtemp()
external_data = os.path.join(external_data_dir, "ext_data")
with open(external_data, "w+") as fobj:
fobj.write("ext_data")

cur_dir = os.getcwd()
os.chdir(erepo.root_dir)
erepo.dvc.add(external_data)
erepo.dvc.scm.add(["ext_data.dvc"])
erepo.dvc.scm.commit("add external data")
os.chdir(cur_dir)

Repo.get(erepo.root_dir, external_data, "ext_data_imported")
assert os.path.isfile("ext_data_imported")
assert filecmp.cmp(external_data, "ext_data_imported", shallow=False)


def test_non_cached_output(tmp_path, erepo):
os.chdir(erepo.root_dir)
erepo.dvc.run(
outs_no_cache=["non_cached_file"], cmd="echo hello > non_cached_file"
)
erepo.dvc.scm.add(["non_cached_file", "non_cached_file.dvc"])
erepo.dvc.scm.commit("add non-cached output")
os.chdir(fspath_py35(tmp_path))
Repo.get(erepo.root_dir, "non_cached_file")

src = os.path.join(erepo.root_dir, "non_cached_file")
assert os.path.isfile("non_cached_file")
assert filecmp.cmp(src, "non_cached_file", shallow=False)


# https://github.com/iterative/dvc/pull/2837#discussion_r352123053
def test_fails_with_files_outside_repo(erepo):
with pytest.raises(OutputNotFoundError):
Repo.get(erepo.root_dir, "/root/")


def test_fails_with_non_existing_files(erepo):
with pytest.raises(PathOutsideRepoError):
Repo.get(erepo.root_dir, "file_does_not_exist")


@pytest.mark.parametrize("dname", [".", "dir", "dir/subdir"])
def test_get_to_dir(dname, erepo):
src = erepo.FOO
Expand Down

0 comments on commit 26a9702

Please sign in to comment.