From 033ef88a4eab29a85869b8170b3f03d4c3030392 Mon Sep 17 00:00:00 2001 From: Alexander Schepanovski Date: Sat, 18 Jan 2020 22:09:59 +0700 Subject: [PATCH 1/5] external: refactor external_repo() and its users Some things are fixed along the way: - no cache shared between incompatible funcs - some incosistent exceptions are now consistent - import updates for git files now works both for dvc and git repos - "auto-generated-upstream already exists" fixed - dvc.api.open()/read() works with git repos now Fixes #3124, #2976. Closes #3172. --- dvc/api.py | 29 +++-- dvc/dependency/repo.py | 120 ++++-------------- dvc/exceptions.py | 1 + dvc/external_repo.py | 210 ++++++++++++++++++------------- dvc/remote/config.py | 4 + dvc/remote/local.py | 6 +- dvc/repo/__init__.py | 14 +-- dvc/repo/fetch.py | 9 +- dvc/repo/get.py | 51 ++------ tests/func/test_external_repo.py | 19 +-- tests/func/test_get.py | 3 +- tests/func/test_import.py | 8 +- tests/func/test_status.py | 10 +- tests/func/test_update.py | 4 +- 14 files changed, 200 insertions(+), 288 deletions(-) diff --git a/dvc/api.py b/dvc/api.py index 5c3401bbe5..9f3912745f 100644 --- a/dvc/api.py +++ b/dvc/api.py @@ -43,7 +43,7 @@ class SummonError(DvcException): pass -class UrlNotDvcRepoError(DvcException): +class UrlNotDvcRepoError(NotDvcRepoError): """Thrown if given url is not a DVC repository. Args: @@ -60,14 +60,11 @@ def get_url(path, repo=None, rev=None, remote=None): `repo`. NOTE: There is no guarantee that the file actually exists in that location. """ - try: - with _make_repo(repo, rev=rev) as _repo: - abspath = os.path.join(_repo.root_dir, path) - out, = _repo.find_outs_by_path(abspath) - remote_obj = _repo.cloud.get_remote(remote) - return str(remote_obj.checksum_to_path_info(out.checksum)) - except NotDvcRepoError: - raise UrlNotDvcRepoError(repo) + with _make_repo(repo, rev=rev) as _repo: + _require_dvc(_repo) + out = _repo.find_out_by_relpath(path) + remote_obj = _repo.cloud.get_remote(remote) + return str(remote_obj.checksum_to_path_info(out.checksum)) def open(path, repo=None, rev=None, remote=None, mode="r", encoding=None): @@ -96,9 +93,8 @@ def __getattr__(self, name): def _open(path, repo=None, rev=None, remote=None, mode="r", encoding=None): with _make_repo(repo, rev=rev) as _repo: - abspath = os.path.join(_repo.root_dir, path) - with _repo.open( - abspath, remote=remote, mode=mode, encoding=encoding + with _repo.open_by_relpath( + path, remote=remote, mode=mode, encoding=encoding ) as fd: yield fd @@ -114,9 +110,6 @@ def read(path, repo=None, rev=None, remote=None, mode="r", encoding=None): @contextmanager def _make_repo(repo_url, rev=None): if not repo_url or os.path.exists(repo_url): - assert ( - rev is None - ), "Git revisions are not supported for local DVC projects." yield Repo(repo_url) else: with external_repo(url=repo_url, rev=rev) as repo: @@ -149,6 +142,7 @@ def prepare_summon(name, repo=None, rev=None, summon_file="dvcsummon.yaml"): named object specification and resolved paths to deps. """ with _make_repo(repo, rev=rev) as _repo: + _require_dvc(_repo) try: path = os.path.join(_repo.root_dir, summon_file) obj = _get_object_spec(name, path) @@ -242,3 +236,8 @@ def _import_string(import_name): else: return importlib.import_module(import_name) return getattr(importlib.import_module(module), obj) + + +def _require_dvc(repo): + if not isinstance(repo, Repo): + raise UrlNotDvcRepoError(repo.url) diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index 69ba25dad7..1626cf44ac 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -1,20 +1,8 @@ -import copy import os -from contextlib import contextmanager - -from funcy import merge from .local import DependencyLOCAL -from dvc.external_repo import cached_clone -from dvc.external_repo import external_repo -from dvc.exceptions import NotDvcRepoError from dvc.exceptions import OutputNotFoundError -from dvc.exceptions import NoOutputInExternalRepoError -from dvc.exceptions import PathMissingError -from dvc.config import NoRemoteError -from dvc.utils.fs import fs_copy from dvc.path_info import PathInfo -from dvc.scm import SCM class DependencyREPO(DependencyLOCAL): @@ -44,34 +32,25 @@ def repo_pair(self): def __str__(self): return "{} ({})".format(self.def_path, self.def_repo[self.PARAM_URL]) - @contextmanager - def _make_repo(self, **overrides): - with external_repo(**merge(self.def_repo, overrides)) as repo: - yield repo + def _make_repo(self, *, locked=True): + from dvc.external_repo import external_repo - def _get_checksum(self, updated=False): - rev_lock = None - if not updated: - rev_lock = self.def_repo.get(self.PARAM_REV_LOCK) + d = self.def_repo + rev = (d.get("rev_lock") if locked else None) or d.get("rev") + return external_repo(d["url"], rev=rev) - try: - with self._make_repo(rev_lock=rev_lock) as repo: + def _get_checksum(self, locked=True): + with self._make_repo(locked=locked) as repo: + try: return repo.find_out_by_relpath(self.def_path).info["md5"] - except (NotDvcRepoError, NoOutputInExternalRepoError): - # Fall through and clone - pass - - repo_path = cached_clone( - self.def_repo[self.PARAM_URL], - rev=rev_lock or self.def_repo.get(self.PARAM_REV), - ) - path = PathInfo(os.path.join(repo_path, self.def_path)) - - return self.repo.cache.local.get_checksum(path) + except OutputNotFoundError: + path = PathInfo(os.path.join(repo.root_dir, self.def_path)) + # We are polluting our repo cache with some dir listing here + return self.repo.cache.local.get_checksum(path) def status(self): - current_checksum = self._get_checksum(updated=False) - updated_checksum = self._get_checksum(updated=True) + current_checksum = self._get_checksum(locked=True) + updated_checksum = self._get_checksum(locked=False) if current_checksum != updated_checksum: return {str(self): "update available"} @@ -84,71 +63,16 @@ def save(self): def dumpd(self): return {self.PARAM_PATH: self.def_path, self.PARAM_REPO: self.def_repo} - def fetch(self): - with self._make_repo( - cache_dir=self.repo.cache.local.cache_dir - ) as repo: - self.def_repo[self.PARAM_REV_LOCK] = repo.scm.get_rev() + def download(self, to): + with self._make_repo() as repo: + if self.def_repo.get(self.PARAM_REV_LOCK) is None: + self.def_repo[self.PARAM_REV_LOCK] = repo.scm.get_rev() - out = repo.find_out_by_relpath(self.def_path) - with repo.state: - try: - repo.cloud.pull(out.get_used_cache()) - except NoRemoteError: - # It would not be good idea to raise exception if the - # file is already present in the cache - if not self.repo.cache.local.changed_cache(out.checksum): - return out - raise - - return out - - @staticmethod - def _is_git_file(repo_dir, path): - from dvc.repo import Repo - - if os.path.isabs(path): - return False - - try: - repo = Repo(repo_dir) - except NotDvcRepoError: - return True - - try: - output = repo.find_out_by_relpath(path) - return not output.use_cache - except OutputNotFoundError: - return True - finally: - repo.close() - - def _copy_if_git_file(self, to_path): - src_path = self.def_path - repo_dir = cached_clone(**self.def_repo) - - if not self._is_git_file(repo_dir, src_path): - return False - - src_full_path = os.path.join(repo_dir, src_path) - dst_full_path = os.path.abspath(to_path) - fs_copy(src_full_path, dst_full_path) - self.def_repo[self.PARAM_REV_LOCK] = SCM(repo_dir).get_rev() - return True + if hasattr(repo, "cache"): + repo.cache.local.cache_dir = self.repo.cache.local.cache_dir - def download(self, to): - try: - if self._copy_if_git_file(to.fspath): - return - - out = self.fetch() - to.info = copy.copy(out.info) - to.checkout() - except (FileNotFoundError): - raise PathMissingError( - self.def_path, self.def_repo[self.PARAM_URL] - ) + repo.pull_to(self.def_path, to.path_info) def update(self): - with self._make_repo(rev_lock=None) as repo: + with self._make_repo(locked=False) as repo: self.def_repo[self.PARAM_REV_LOCK] = repo.scm.get_rev() diff --git a/dvc/exceptions.py b/dvc/exceptions.py index a551e35db1..402ee9c987 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -227,6 +227,7 @@ def __init__(self, etag, cached_etag): class FileMissingError(DvcException): def __init__(self, path): + self.path = path super().__init__( "Can't find '{}' neither locally nor on remote".format(path) ) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 9ff2f2a413..ed2e94b788 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -3,25 +3,27 @@ from contextlib import contextmanager from distutils.dir_util import copy_tree -from funcy import retry +from funcy import retry, suppress, memoize, cached_property -from dvc.config import NoRemoteError, ConfigError +from dvc.compat import fspath +from dvc.repo import Repo +from dvc.config import Config, NoRemoteError, NotDvcRepoError from dvc.exceptions import NoRemoteInExternalRepoError +from dvc.exceptions import OutputNotFoundError, NoOutputInExternalRepoError +from dvc.exceptions import FileMissingError, PathMissingError from dvc.remote import RemoteConfig -from dvc.exceptions import NoOutputInExternalRepoError -from dvc.exceptions import OutputNotFoundError -from dvc.utils.fs import remove - - -REPO_CACHE = {} +from dvc.utils.fs import remove, fs_copy +from dvc.scm import SCM @contextmanager -def external_repo(url=None, rev=None, rev_lock=None, cache_dir=None): - from dvc.repo import Repo +def external_repo(url, rev=None): + path = _cached_clone(url, rev) + try: + repo = ExternalRepo(path, url) + except NotDvcRepoError: + repo = ExternalGitRepo(path, url) - path = _external_repo(url=url, rev=rev_lock or rev, cache_dir=cache_dir) - repo = Repo(path) try: yield repo except NoRemoteError: @@ -30,28 +32,121 @@ def external_repo(url=None, rev=None, rev_lock=None, cache_dir=None): if exc.repo is repo: raise NoOutputInExternalRepoError(exc.output, repo.root_dir, url) raise - repo.close() + except FileMissingError as exc: + raise PathMissingError(exc.path, url) + finally: + repo.close() + + +def clean_repos(): + # Outside code should not see cache while we are removing + repo_paths = list(_cached_clone.memory.values()) + _cached_clone.memory.clear() + + for path in repo_paths: + _remove(path) + + +class ExternalRepo(Repo): + def __init__(self, root_dir, url): + super().__init__(root_dir) + self.url = url + self._set_upstream() + + def pull_to(self, path, to_info): + try: + out = None + with suppress(OutputNotFoundError): + out = self.find_out_by_relpath(path) + + if out and out.use_cache: + self._pull_cached(out, to_info) + return + + # Git handled files can't have absolute path + if os.path.isabs(path): + raise FileNotFoundError + + fs_copy(os.path.join(self.root_dir, path), fspath(to_info)) + except FileNotFoundError: + raise PathMissingError(path, self.url) + + def _pull_cached(self, out, to_info): + with self.state: + self.cloud.pull(out.get_used_cache()) + out.path_info = to_info + failed = out.checkout() + # This might happen when pull haven't really pulled all the files + if failed: + raise FileNotFoundError + + def _set_upstream(self): + # check if the URL is local and no default remote is present + # add default remote pointing to the original repo's cache location + if os.path.isdir(self.url): + rconfig = RemoteConfig(self.config) + if not rconfig.has_default(): + src_repo = Repo(self.url) + try: + rconfig.add( + "auto-generated-upstream", + src_repo.cache.local.cache_dir, + default=True, + level=Config.LEVEL_LOCAL, + ) + finally: + src_repo.close() + + +class ExternalGitRepo: + def __init__(self, root_dir, url): + self.root_dir = root_dir + self.url = url + + @cached_property + def scm(self): + return SCM(self.root_dir) + def close(self): + if "scm" in self.__dict__: + self.scm.close() -def cached_clone(url, rev=None, **_ignored_kwargs): + def find_out_by_relpath(self, path): + raise OutputNotFoundError(path, self) + + def pull_to(self, path, to_info): + try: + # Git handled files can't have absolute path + if os.path.isabs(path): + raise FileNotFoundError + + fs_copy(os.path.join(self.root_dir, path), fspath(to_info)) + except FileNotFoundError: + raise PathMissingError(path, self.url) + + @contextmanager + def open_by_relpath(self, path, mode="r", encoding=None): + try: + abs_path = os.path.join(self.root_dir, path) + with open(abs_path, mode, encoding) as fd: + yield fd + except FileNotFoundError: + raise PathMissingError(path, self.url) + + +@memoize +def _cached_clone(url, rev): """Clone an external git repo to a temporary directory. Returns the path to a local temporary directory with the specified revision checked out. - - Uses the REPO_CACHE to avoid accessing the remote server again if - cloning from the same URL twice in the same session. - """ - new_path = tempfile.mkdtemp("dvc-erepo") - # Copy and adjust existing clean clone - if (url, None, None) in REPO_CACHE: - old_path = REPO_CACHE[url, None, None] - + if url in _cached_clone.memory: + # Copy and an existing clean clone # This one unlike shutil.copytree() works with an existing dir - copy_tree(old_path, new_path) + copy_tree(_cached_clone.memory[url], new_path) else: # Create a new clone _clone_repo(url, new_path) @@ -59,7 +154,7 @@ def cached_clone(url, rev=None, **_ignored_kwargs): # Save clean clone dir so that we will have access to a default branch clean_clone_path = tempfile.mkdtemp("dvc-erepo") copy_tree(new_path, clean_clone_path) - REPO_CACHE[url, None, None] = clean_clone_path + _cached_clone.memory[url] = clean_clone_path # Check out the specified revision if rev is not None: @@ -68,46 +163,6 @@ def cached_clone(url, rev=None, **_ignored_kwargs): return new_path -def _external_repo(url=None, rev=None, cache_dir=None): - from dvc.config import Config - from dvc.cache import CacheConfig - from dvc.repo import Repo - - key = (url, rev, cache_dir) - if key in REPO_CACHE: - return REPO_CACHE[key] - - new_path = cached_clone(url, rev=rev) - - repo = Repo(new_path) - try: - # check if the URL is local and no default remote is present - # add default remote pointing to the original repo's cache location - if os.path.isdir(url): - rconfig = RemoteConfig(repo.config) - if not _default_remote_set(rconfig): - original_repo = Repo(url) - try: - rconfig.add( - "auto-generated-upstream", - original_repo.cache.local.cache_dir, - default=True, - level=Config.LEVEL_LOCAL, - ) - finally: - original_repo.close() - - if cache_dir is not None: - cache_config = CacheConfig(repo.config) - cache_config.set_dir(cache_dir, level=Config.LEVEL_LOCAL) - finally: - # Need to close/reopen repo to force config reread - repo.close() - - REPO_CACHE[key] = new_path - return new_path - - def _git_checkout(repo_path, revision): from dvc.scm import Git @@ -118,15 +173,6 @@ def _git_checkout(repo_path, revision): git.close() -def clean_repos(): - # Outside code should not see cache while we are removing - repo_paths = list(REPO_CACHE.values()) - REPO_CACHE.clear() - - for path in repo_paths: - _remove(path) - - def _remove(path): if os.name == "nt": # git.exe may hang for a while not permitting to remove temp dir @@ -141,19 +187,3 @@ def _clone_repo(url, path): git = Git.clone(url, path) git.close() - - -def _default_remote_set(rconfig): - """ - Checks if default remote config is present. - Args: - rconfig: a remote config - - Returns: - True if the default remote config is set, else False - """ - try: - rconfig.get_default() - return True - except ConfigError: - return False diff --git a/dvc/remote/config.py b/dvc/remote/config.py index 3401d3e82f..26c023ee12 100644 --- a/dvc/remote/config.py +++ b/dvc/remote/config.py @@ -157,3 +157,7 @@ def get_default(self, level=None): return self.config.get( Config.SECTION_CORE, Config.SECTION_CORE_REMOTE, level=level ) + + def has_default(self): + core = self.config.config[Config.SECTION_CORE] + return bool(core.get(Config.SECTION_CORE_REMOTE)) diff --git a/dvc/remote/local.py b/dvc/remote/local.py index 602e9d875c..d9e0bc69cb 100644 --- a/dvc/remote/local.py +++ b/dvc/remote/local.py @@ -61,7 +61,7 @@ def __init__(self, repo, config): cwd = config[Config.PRIVATE_CWD] cache_dir = os.path.abspath(os.path.join(cwd, cache_dir)) - self.path_info = PathInfo(cache_dir) if cache_dir else None + self.cache_dir = cache_dir self._dir_info = {} @property @@ -72,6 +72,10 @@ def state(self): def cache_dir(self): return self.path_info.fspath if self.path_info else None + @cache_dir.setter + def cache_dir(self, value): + self.path_info = PathInfo(value) if value else None + @classmethod def supported(cls, config): return True diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 9cef82e11f..9d0a506a87 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -17,7 +17,6 @@ ) from dvc.path_info import PathInfo from dvc.remote.base import RemoteActionNotImplemented -from dvc.utils import relpath from dvc.utils.fs import path_isin from .graph import check_acyclic, get_pipeline, get_pipelines @@ -428,11 +427,11 @@ def is_dvc_internal(self, path): return self.DVC_DIR in path_parts @contextmanager - def open(self, path, remote=None, mode="r", encoding=None): + def open_by_relpath(self, path, remote=None, mode="r", encoding=None): """Opens a specified resource as a file descriptor""" cause = None try: - out, = self.find_outs_by_path(path) + out = self.find_out_by_relpath(path) except OutputNotFoundError as exc: out = None cause = exc @@ -443,14 +442,15 @@ def open(self, path, remote=None, mode="r", encoding=None): yield fd return except FileNotFoundError as exc: - raise FileMissingError(relpath(path, self.root_dir)) from exc + raise FileMissingError(path) from exc - if self.tree.exists(path): - with self.tree.open(path, mode, encoding) as fd: + abs_path = os.path.join(self.root_dir, path) + if os.path.exists(abs_path): + with open(abs_path, mode=mode, encoding=encoding) as fd: yield fd return - raise FileMissingError(relpath(path, self.root_dir)) from cause + raise FileMissingError(path) from cause def _open_cached(self, out, remote=None, mode="r", encoding=None): if out.isdir(): diff --git a/dvc/repo/fetch.py b/dvc/repo/fetch.py index 8b621469b7..e995630e41 100644 --- a/dvc/repo/fetch.py +++ b/dvc/repo/fetch.py @@ -4,7 +4,6 @@ from dvc.config import NoRemoteError from dvc.exceptions import DownloadError from dvc.exceptions import OutputNotFoundError -from dvc.external_repo import external_repo from dvc.scm.base import CloneError @@ -70,11 +69,13 @@ def _fetch( def _fetch_external(self, repo_url, repo_rev, files): - failed = 0 + from dvc.external_repo import external_repo - cache_dir = self.cache.local.cache_dir + failed = 0 try: - with external_repo(repo_url, repo_rev, cache_dir=cache_dir) as repo: + with external_repo(repo_url, repo_rev) as repo: + repo.cache.local.cache_dir = self.cache.local.cache_dir + with repo.state: cache = NamedCache() for name in files: diff --git a/dvc/repo/get.py b/dvc/repo/get.py index d3292f2ff1..428e68f9af 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -3,22 +3,11 @@ import shortuuid -from dvc.exceptions import ( - DvcException, - NotDvcRepoError, - OutputNotFoundError, - PathMissingError, -) -from dvc.external_repo import ( - external_repo, - cached_clone, - NoOutputInExternalRepoError, -) -from dvc.path_info import PathInfo +from dvc.exceptions import DvcException from dvc.stage import Stage from dvc.utils import resolve_output from dvc.utils.fs import remove -from dvc.utils.fs import fs_copy +from dvc.path_info import PathInfo logger = logging.getLogger(__name__) @@ -33,6 +22,8 @@ def __init__(self): @staticmethod def get(url, path, out=None, rev=None): + from dvc.external_repo import external_repo + out = resolve_output(path, out) if Stage.is_valid_filename(out): @@ -46,8 +37,10 @@ def get(url, path, out=None, rev=None): dpath = os.path.dirname(os.path.abspath(out)) tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) try: - try: - with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo: + with external_repo(url=url, rev=rev) as repo: + if hasattr(repo, "cache"): + repo.cache.local.cache_dir = tmp_dir + # Try any links possible to avoid data duplication. # # Not using symlink, because we need to remove cache after we @@ -58,33 +51,7 @@ def get(url, path, out=None, rev=None): # Also, we can't use theoretical "move" link type here, because # the same cache file might be used a few times in a directory. repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] - output = repo.find_out_by_relpath(path) - if output.use_cache: - _get_cached(repo, output, out) - return - # Non-cached output, fall through and try to copy from git. - except (NotDvcRepoError, NoOutputInExternalRepoError): - # Not a DVC repository or, possibly, path is not tracked by DVC. - # Fall through and try to copy from git. - pass - if os.path.isabs(path): - raise FileNotFoundError - - repo_dir = cached_clone(url, rev=rev) - - fs_copy(os.path.join(repo_dir, path), out) - except (OutputNotFoundError, FileNotFoundError): - raise PathMissingError(path, url) + repo.pull_to(path, PathInfo(out)) finally: remove(tmp_dir) - - -def _get_cached(repo, output, out): - with repo.state: - repo.cloud.pull(output.get_used_cache()) - output.path_info = PathInfo(os.path.abspath(out)) - failed = output.checkout() - # This might happen when pull haven't really pulled all the files - if failed: - raise FileNotFoundError diff --git a/tests/func/test_external_repo.py b/tests/func/test_external_repo.py index 6806a08557..4426ef1e1a 100644 --- a/tests/func/test_external_repo.py +++ b/tests/func/test_external_repo.py @@ -1,11 +1,8 @@ -import os - from mock import patch +from dvc.compat import fspath from dvc.external_repo import external_repo from dvc.scm.git import Git -from dvc.compat import fspath -from dvc.utils.fs import path_isin def test_external_repo(erepo_dir): @@ -15,20 +12,14 @@ def test_external_repo(erepo_dir): erepo_dir.dvc_gen("file", "master", commit="create file on master") url = fspath(erepo_dir) - # We will share cache dir, to fetch version file - cache_dir = erepo_dir.dvc.cache.local.cache_dir with patch.object(Git, "clone", wraps=Git.clone) as mock: - with external_repo(url, cache_dir=cache_dir) as repo: - with repo.open(os.path.join(repo.root_dir, "file")) as fd: + with external_repo(url) as repo: + with repo.open_by_relpath("file") as fd: assert fd.read() == "master" - with external_repo(url, rev="branch", cache_dir=cache_dir) as repo: - with repo.open(os.path.join(repo.root_dir, "file")) as fd: + with external_repo(url, rev="branch") as repo: + with repo.open_by_relpath("file") as fd: assert fd.read() == "branch" - # Check cache_dir is unset - with external_repo(url) as repo: - assert path_isin(repo.cache.local.cache_dir, repo.root_dir) - assert mock.call_count == 1 diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 87d8ceea3f..02245d9e01 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -6,7 +6,8 @@ from dvc.cache import Cache from dvc.config import Config from dvc.main import main -from dvc.repo.get import GetDVCFileError, PathMissingError +from dvc.exceptions import PathMissingError +from dvc.repo.get import GetDVCFileError from dvc.repo import Repo from dvc.system import System from dvc.utils.fs import makedirs diff --git a/tests/func/test_import.py b/tests/func/test_import.py index f15d211420..0c1c1b4ce8 100644 --- a/tests/func/test_import.py +++ b/tests/func/test_import.py @@ -1,20 +1,18 @@ import filecmp import os import shutil +from dvc.compat import fspath import pytest from mock import patch from dvc.cache import Cache from dvc.config import Config -from dvc.exceptions import DownloadError -from dvc.exceptions import PathMissingError -from dvc.exceptions import NoOutputInExternalRepoError +from dvc.exceptions import DownloadError, PathMissingError from dvc.config import NoRemoteError from dvc.stage import Stage from dvc.system import System from dvc.utils.fs import makedirs -from dvc.compat import fspath import dvc.data_cloud as cloud from tests.utils import trees_equal @@ -266,5 +264,5 @@ def test_import_non_existing(erepo_dir, tmp_dir, dvc): tmp_dir.dvc.imp(fspath(erepo_dir), "invalid_output") # https://github.com/iterative/dvc/pull/2837#discussion_r352123053 - with pytest.raises(NoOutputInExternalRepoError): + with pytest.raises(PathMissingError): tmp_dir.dvc.imp(fspath(erepo_dir), "/root/", "root") diff --git a/tests/func/test_status.py b/tests/func/test_status.py index 8b8d3d2ba9..7eb9819a5c 100644 --- a/tests/func/test_status.py +++ b/tests/func/test_status.py @@ -33,9 +33,7 @@ def test_status_non_dvc_repo_import(tmp_dir, dvc, erepo_dir): with erepo_dir.branch("branch", new=True), erepo_dir.chdir(): erepo_dir.scm.repo.index.remove([".dvc"], r=True) shutil.rmtree(".dvc") - erepo_dir.scm_gen("file", "first version") - erepo_dir.scm.add(["file"]) - erepo_dir.scm.commit("first version") + erepo_dir.scm_gen("file", "first version", commit="first version") dvc.imp(fspath(erepo_dir), "file", "file", rev="branch") @@ -49,8 +47,6 @@ def test_status_non_dvc_repo_import(tmp_dir, dvc, erepo_dir): with erepo_dir.branch("branch", new=False), erepo_dir.chdir(): erepo_dir.scm_gen("file", "second_version", commit="update file") - erepo_dir.scm.add(["file"]) - erepo_dir.scm.commit("first version") status, = dvc.status(["file.dvc"])["file.dvc"] @@ -65,9 +61,7 @@ def test_status_before_and_after_dvc_init(tmp_dir, dvc, erepo_dir): with erepo_dir.chdir(): erepo_dir.scm.repo.index.remove([".dvc"], r=True) shutil.rmtree(".dvc") - erepo_dir.scm_gen("file", "first version") - erepo_dir.scm.add(["file"]) - erepo_dir.scm.commit("first version") + erepo_dir.scm_gen("file", "first version", commit="first verison") old_rev = erepo_dir.scm.get_rev() dvc.imp(fspath(erepo_dir), "file", "file") diff --git a/tests/func/test_update.py b/tests/func/test_update.py index 6f483fff0a..769bc1a756 100644 --- a/tests/func/test_update.py +++ b/tests/func/test_update.py @@ -114,9 +114,7 @@ def test_update_before_and_after_dvc_init(tmp_dir, dvc, erepo_dir): with erepo_dir.chdir(): erepo_dir.scm.repo.index.remove([".dvc"], r=True) shutil.rmtree(".dvc") - erepo_dir.scm_gen("file", "first version") - erepo_dir.scm.add(["file"]) - erepo_dir.scm.commit("first version") + erepo_dir.scm_gen("file", "first version", commit="first version") old_rev = erepo_dir.scm.get_rev() stage = dvc.imp(fspath(erepo_dir), "file", "file") From 9de0643d971e310ee06382501ba972158efac2f8 Mon Sep 17 00:00:00 2001 From: Alexander Schepanovski Date: Sat, 18 Jan 2020 22:38:55 +0700 Subject: [PATCH 2/5] api: allow rev on local repos --- dvc/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/api.py b/dvc/api.py index 9f3912745f..f6082f68c1 100644 --- a/dvc/api.py +++ b/dvc/api.py @@ -109,7 +109,7 @@ def read(path, repo=None, rev=None, remote=None, mode="r", encoding=None): @contextmanager def _make_repo(repo_url, rev=None): - if not repo_url or os.path.exists(repo_url): + if rev is None and (not repo_url or os.path.exists(repo_url)): yield Repo(repo_url) else: with external_repo(url=repo_url, rev=rev) as repo: From ee022eada082ee28b4027a4317f3d63024aff07a Mon Sep 17 00:00:00 2001 From: Alexander Schepanovski Date: Sat, 18 Jan 2020 22:39:52 +0700 Subject: [PATCH 3/5] api: adjust get_url requiring dvc repo to the new reality --- tests/func/test_api.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/func/test_api.py b/tests/func/test_api.py index b3f0c0aba2..0512493f36 100644 --- a/tests/func/test_api.py +++ b/tests/func/test_api.py @@ -6,7 +6,7 @@ import pytest from dvc import api -from dvc.api import SummonError, UrlNotDvcRepoError +from dvc.api import SummonError, NotDvcRepoError, UrlNotDvcRepoError from dvc.compat import fspath from dvc.exceptions import FileMissingError from dvc.main import main @@ -51,14 +51,14 @@ def test_get_url_external(erepo_dir, remote_url): assert api.get_url("foo", repo=repo_url) == expected_url -def test_get_url_git_only_repo_throws_exception(tmp_dir, scm): +def test_get_url_requires_dvc(tmp_dir, scm): tmp_dir.scm_gen({"foo": "foo"}, commit="initial") - with pytest.raises(UrlNotDvcRepoError) as exc_info: - api.get_url("foo", fspath(tmp_dir)) + with pytest.raises(NotDvcRepoError, match="not inside of a dvc repo"): + api.get_url("foo", repo=fspath(tmp_dir)) - # On windows, `exc_info` has path escaped, eg: `C:\\\\Users\\\\travis` - assert fspath(tmp_dir) in str(exc_info).replace("\\\\", "\\") + with pytest.raises(UrlNotDvcRepoError): + api.get_url("foo", repo="file://{}".format(tmp_dir)) @pytest.mark.parametrize("remote_url", all_remote_params, indirect=True) From eb97e0b3939ad40f9a66c6cddea729f9263dcd67 Mon Sep 17 00:00:00 2001 From: Alexander Schepanovski Date: Sun, 19 Jan 2020 18:35:38 +0700 Subject: [PATCH 4/5] external: do not fail with no remote if cache is available --- dvc/external_repo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index ed2e94b788..470646c6e5 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -73,7 +73,10 @@ def pull_to(self, path, to_info): def _pull_cached(self, out, to_info): with self.state: - self.cloud.pull(out.get_used_cache()) + # Only pull unless all needed cache is present + if out.changed_cache(): + self.cloud.pull(out.get_used_cache()) + out.path_info = to_info failed = out.checkout() # This might happen when pull haven't really pulled all the files From 72812b519ecfbd48531040c63c74d4cbe6e5b1c6 Mon Sep 17 00:00:00 2001 From: Alexander Schepanovski Date: Sun, 19 Jan 2020 22:14:56 +0700 Subject: [PATCH 5/5] api: fix get_url() non-dvc repo error message --- dvc/api.py | 4 ++-- tests/func/test_api.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dvc/api.py b/dvc/api.py index f6082f68c1..e1f87d09c1 100644 --- a/dvc/api.py +++ b/dvc/api.py @@ -10,7 +10,7 @@ from voluptuous import Schema, Required, Invalid from dvc.repo import Repo -from dvc.exceptions import DvcException, NotDvcRepoError +from dvc.exceptions import DvcException from dvc.external_repo import external_repo @@ -43,7 +43,7 @@ class SummonError(DvcException): pass -class UrlNotDvcRepoError(NotDvcRepoError): +class UrlNotDvcRepoError(DvcException): """Thrown if given url is not a DVC repository. Args: diff --git a/tests/func/test_api.py b/tests/func/test_api.py index 0512493f36..931c1a8f71 100644 --- a/tests/func/test_api.py +++ b/tests/func/test_api.py @@ -6,9 +6,9 @@ import pytest from dvc import api -from dvc.api import SummonError, NotDvcRepoError, UrlNotDvcRepoError +from dvc.api import SummonError, UrlNotDvcRepoError from dvc.compat import fspath -from dvc.exceptions import FileMissingError +from dvc.exceptions import FileMissingError, NotDvcRepoError from dvc.main import main from dvc.path_info import URLInfo from dvc.remote.config import RemoteConfig