Revert "remote: base: don't checkout existing file (iterative#2358)"
This reverts commit 5ec1d27.

The tests are kept so we can try to make them work, and `_get_cache_type()` is
kept to be used later.
Suor committed Dec 2, 2019
1 parent 4ab4e43 commit 30d58e1
Showing 2 changed files with 36 additions and 76 deletions.
60 changes: 19 additions & 41 deletions dvc/remote/base.py
@@ -758,45 +758,20 @@ def safe_remove(self, path_info, force=False):
         self.remove(path_info)

     def _checkout_file(
-        self,
-        path_info,
-        checksum,
-        force,
-        progress_callback=None,
-        save_link=True,
+        self, path_info, checksum, force, progress_callback=None
     ):
-        # NOTE: In case if path_info is already cached and path_info's
-        #       link type matches cache link type, we would like to avoid
-        #       relinking.
-        if self.changed(
-            path_info, {self.PARAM_CHECKSUM: checksum}
-        ) or not self._link_matches(path_info):
+        cache_info = self.checksum_to_path_info(checksum)
+        if self.exists(path_info):
+            msg = "data '{}' exists. Removing before checkout."
+            logger.warning(msg.format(str(path_info)))
             self.safe_remove(path_info, force=force)

-            cache_info = self.checksum_to_path_info(checksum)
-            self.link(cache_info, path_info)
-
-            if save_link:
-                self.state.save_link(path_info)
-
-            self.state.save(path_info, checksum)
-        else:
-            # NOTE: performing (un)protection costs us +/- the same as checking
-            #       if path_info is protected. Instead of implementing logic,
-            #       just (un)protect according to self.protected.
-            if self.protected:
-                self.protect(path_info)
-            else:
-                # NOTE dont allow copy, because we checked before that link
-                #      type matches cache, and we don't want data duplication
-                self.unprotect(path_info, allow_copy=False)
-
+        self.link(cache_info, path_info)
+        self.state.save_link(path_info)
+        self.state.save(path_info, checksum)
         if progress_callback:
             progress_callback(str(path_info))

-    def _link_matches(self, path_info):
-        return True
-
     def makedirs(self, path_info):
         """Optional: Implement only if the remote needs to create
         directories before copying/linking/moving data
@@ -818,14 +793,17 @@ def _checkout_dir(
         for entry in dir_info:
             relative_path = entry[self.PARAM_RELPATH]
             entry_checksum = entry[self.PARAM_CHECKSUM]
+            entry_cache_info = self.checksum_to_path_info(entry_checksum)
             entry_info = path_info / relative_path
-            self._checkout_file(
-                entry_info,
-                entry_checksum,
-                force,
-                progress_callback,
-                save_link=False,
-            )
+
+            entry_checksum_info = {self.PARAM_CHECKSUM: entry_checksum}
+            if self.changed(entry_info, entry_checksum_info):
+                if self.exists(entry_info):
+                    self.safe_remove(entry_info, force=force)
+                self.link(entry_cache_info, entry_info)
+                self.state.save(entry_info, entry_checksum)
+            if progress_callback:
+                progress_callback(str(entry_info))

         self._remove_redundant_files(path_info, dir_info, force)

@@ -904,7 +882,7 @@ def get_files_number(self, checksum):
         return 1

     @staticmethod
-    def unprotect(path_info, allow_copy=True):
+    def unprotect(path_info):
         pass

     def _get_unpacked_dir_names(self, checksums):
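
For context, the change being reverted tried to skip relinking during checkout: `_checkout_file` only removed and relinked the path if its checksum had changed or if its current link type no longer matched the cache type reported by `_get_cache_type()` (see the removed `_link_matches` at the bottom of dvc/remote/local.py below). A rough standalone sketch of that decision, with hypothetical helpers standing in for DVC's `changed()` and `System.is_symlink()`/`is_hardlink()` checks:

import os


def current_link_type(path):
    # Hypothetical stand-in for System.is_symlink()/System.is_hardlink():
    # classify how the workspace file is currently linked to the cache.
    if os.path.islink(path):
        return "symlink"
    if os.stat(path).st_nlink > 1:
        return "hardlink"
    return "copy"  # plain copy (or reflink, which we cannot tell apart here)


def needs_relink(path, checksum_changed, cache_type):
    # Condensed version of the reverted check: relink only when the content
    # is out of date or the existing link does not match the configured
    # cache type ("symlink", "hardlink", or "copy").
    if not os.path.exists(path):
        return True
    if checksum_changed:
        return True
    return current_link_type(path) != cache_type

When this kind of check came back false, the reverted code only re-applied or removed write protection instead of relinking; the revert restores the original behaviour of always removing the existing path and linking it from the cache again.
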
52 changes: 17 additions & 35 deletions dvc/remote/local.py
@@ -394,22 +394,19 @@ def _log_missing_caches(checksum_info_dict):
             logger.warning(msg)

     @staticmethod
-    def _unprotect_file(path, allow_copy=True):
+    def _unprotect_file(path):
         if System.is_symlink(path) or System.is_hardlink(path):
-            if allow_copy:
-                logger.debug("Unprotecting '{}'".format(path))
-                tmp = os.path.join(os.path.dirname(path), "." + str(uuid()))
-
-                # The operations order is important here - if some application
-                # would access the file during the process of copyfile then it
-                # would get only the part of file. So, at first, the file
-                # should be copied with the temporary name, and then
-                # original file should be replaced by new.
-                copyfile(
-                    path, tmp, name="Unprotecting '{}'".format(relpath(path))
-                )
-                remove(path)
-                os.rename(tmp, path)
+            logger.debug("Unprotecting '{}'".format(path))
+            tmp = os.path.join(os.path.dirname(path), "." + str(uuid()))
+
+            # The operations order is important here - if some application
+            # would access the file during the process of copyfile then it
+            # would get only the part of file. So, at first, the file should be
+            # copied with the temporary name, and then original file should be
+            # replaced by new.
+            copyfile(path, tmp, name="Unprotecting '{}'".format(relpath(path)))
+            remove(path)
+            os.rename(tmp, path)

         else:
             logger.debug(
@@ -419,21 +416,21 @@ def _unprotect_file(path, allow_copy=True):

         os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)

-    def _unprotect_dir(self, path, allow_copy=True):
+    def _unprotect_dir(self, path):
         for fname in walk_files(path, self.repo.dvcignore):
-            self._unprotect_file(fname, allow_copy)
+            RemoteLOCAL._unprotect_file(fname)

-    def unprotect(self, path_info, allow_copy=True):
+    def unprotect(self, path_info):
         path = path_info.fspath
         if not os.path.exists(path):
             raise DvcException(
                 "can't unprotect non-existing data '{}'".format(path)
             )

         if os.path.isdir(path):
-            self._unprotect_dir(path, allow_copy)
+            self._unprotect_dir(path)
         else:
-            self._unprotect_file(path, allow_copy)
+            RemoteLOCAL._unprotect_file(path)

     @staticmethod
     def protect(path_info):
@@ -525,18 +522,3 @@ def _get_cache_type(self, path_info):

         self.cache_type_confirmed = True
         return self.cache_types[0]
-
-    def _link_matches(self, path_info):
-        is_hardlink = System.is_hardlink(path_info)
-        is_symlink = System.is_symlink(path_info)
-        is_copy_or_reflink = not is_hardlink and not is_symlink
-
-        cache_type = self._get_cache_type(path_info)
-
-        if cache_type == "symlink":
-            return is_symlink
-
-        if cache_type == "hardlink":
-            return is_hardlink
-
-        return is_copy_or_reflink
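The comment kept in `_unprotect_file` explains why the copy happens under a temporary name first: any reader that opens the path mid-operation should see either the old link or the complete new copy, never a half-written file. A minimal sketch of the same pattern using only the standard library (the function name is mine, and plain `shutil.copyfile` stands in for DVC's progress-reporting `copyfile` helper):

import os
import shutil
import stat
import uuid


def make_private_copy(path):
    # Replace a symlink/hardlink with an independent, writable copy of its
    # content, without ever exposing a partially written file at `path`.
    tmp = os.path.join(os.path.dirname(path) or ".", "." + uuid.uuid4().hex)

    # 1. Copy the full content under a temporary name in the same directory.
    shutil.copyfile(path, tmp)

    # 2. Only then remove the original link and move the finished copy into
    #    place, so `path` always points at either the old or the new file.
    os.remove(path)
    os.rename(tmp, path)

    # Make the fresh copy writable again (cache files are usually read-only).
    os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)

`os.replace()` could swap the finished copy into place in one atomic step, but the essential point is the ordering the original code insists on: copy first, and only then touch the original path.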
