From 9d0cfcafdfeeec37a021a9f6a065ec63f82ec5e2 Mon Sep 17 00:00:00 2001 From: Vera Sativa Date: Thu, 5 Dec 2019 11:05:47 -0300 Subject: [PATCH 1/7] feat: [WIP] remote dir download --- dvc/remote/base.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 10969250a0..77bca46940 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -547,11 +547,20 @@ def download( if to_info.scheme != "local": raise NotImplementedError - logger.debug("Downloading '{}' to '{}'".format(from_info, to_info)) + makedirs(to_info.parent, exist_ok=True, mode=dir_mode) + + if self.isdir(from_info): + # need to create dir here + # also pass the individual file name (file and to) + for file_info in self.walk_files(from_info): + self.single_file_download(file_info, to_info, name, no_progress_bar, file_mode) + else: + self.single_file_download(from_info, to_info, name, no_progress_bar, file_mode) + def single_file_download(self, from_info, to_info, name, no_progress_bar, file_mode): + logger.debug("Downloading '{}' to '{}'".format(from_info, to_info)) name = name or to_info.name - makedirs(to_info.parent, exist_ok=True, mode=dir_mode) tmp_file = tmp_fname(to_info) try: @@ -567,6 +576,7 @@ def download( return 0 + def open(self, path_info, mode="r", encoding=None): if hasattr(self, "_generate_download_url"): get_url = partial(self._generate_download_url, path_info) From f6c719a6f30f0c989b3c39d07746d609ab6b63bc Mon Sep 17 00:00:00 2001 From: Vera Sativa Date: Thu, 5 Dec 2019 14:53:01 -0300 Subject: [PATCH 2/7] feat: [WIP] dir download working single-thread --- dvc/remote/base.py | 51 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 8 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 77bca46940..6813c55e0b 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -30,6 +30,7 @@ from dvc.utils.compat import urlparse from dvc.utils.fs import move from dvc.utils.http import open_url +from pathlib import Path logger = logging.getLogger(__name__) @@ -549,15 +550,50 @@ def download( makedirs(to_info.parent, exist_ok=True, mode=dir_mode) + print(from_info) + if self.isdir(from_info): - # need to create dir here - # also pass the individual file name (file and to) - for file_info in self.walk_files(from_info): - self.single_file_download(file_info, to_info, name, no_progress_bar, file_mode) + makedirs(to_info, exist_ok=True, mode=dir_mode) + # TEMP limits to test + limit = 5 + current = 0 + # / TEMP limits to test + for file_from_info in self.walk_files(from_info): + # TEMP limits to test + current += 1 + if current == limit: + exit() + # / TEMP limits to test + + file_to_info = Path( + str(to_info) + + "/" + + str(file_from_info)[len(str(from_info)) + 1 :] + ) + + self.single_file_download( + file_from_info, + file_to_info, + name, + no_progress_bar, + file_mode, + dir_mode, + ) else: - self.single_file_download(from_info, to_info, name, no_progress_bar, file_mode) + self.single_file_download( + from_info, to_info, name, no_progress_bar, file_mode, dir_mode + ) + + def single_file_download( + self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode + ): + # Make sure full path exist + # print(to_info) + for parent in reversed(to_info.parents): + if not parent.exists(): + # print('making dir: {}'.format(parent)) + makedirs(parent, exist_ok=True, mode=dir_mode) - def single_file_download(self, from_info, to_info, name, no_progress_bar, file_mode): logger.debug("Downloading '{}' to '{}'".format(from_info, to_info)) name = name or to_info.name @@ -568,7 +604,7 @@ def single_file_download(self, from_info, to_info, name, no_progress_bar, file_m from_info, tmp_file, name=name, no_progress_bar=no_progress_bar ) except Exception: - msg = "failed to download '{}' to '{}'" + msg = "failed to doooooownload '{}' to '{}'" logger.exception(msg.format(from_info, to_info)) return 1 # 1 fail @@ -576,7 +612,6 @@ def single_file_download(self, from_info, to_info, name, no_progress_bar, file_m return 0 - def open(self, path_info, mode="r", encoding=None): if hasattr(self, "_generate_download_url"): get_url = partial(self._generate_download_url, path_info) From cf6da7bd8c18078196ee17cae1b017e3832a5e93 Mon Sep 17 00:00:00 2001 From: Vera Sativa Date: Thu, 5 Dec 2019 16:28:35 -0300 Subject: [PATCH 3/7] import-url: support directories: multi-thread working --- dvc/remote/base.py | 57 ++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 6813c55e0b..c22f30b5a1 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -30,7 +30,7 @@ from dvc.utils.compat import urlparse from dvc.utils.fs import move from dvc.utils.http import open_url -from pathlib import Path + logger = logging.getLogger(__name__) @@ -550,35 +550,33 @@ def download( makedirs(to_info.parent, exist_ok=True, mode=dir_mode) - print(from_info) - if self.isdir(from_info): makedirs(to_info, exist_ok=True, mode=dir_mode) - # TEMP limits to test - limit = 5 - current = 0 - # / TEMP limits to test - for file_from_info in self.walk_files(from_info): - # TEMP limits to test - current += 1 - if current == limit: - exit() - # / TEMP limits to test - - file_to_info = Path( - str(to_info) - + "/" - + str(file_from_info)[len(str(from_info)) + 1 :] - ) + file_to_infos = ( + to_info / file_to_info.relative_to(from_info) + for file_to_info in self.walk_files(from_info) + ) - self.single_file_download( + with ThreadPoolExecutor(max_workers=self.JOBS) as executor: + file_from_info = list(self.walk_files(from_info)) + with Tqdm( file_from_info, - file_to_info, - name, - no_progress_bar, - file_mode, - dir_mode, - ) + total=len(file_from_info), + desc="Downloading directory", + ) as file_from_info: + return sum( + executor.map( + partial( + self.single_file_download, + name=name, + no_progress_bar=True, + file_mode=file_mode, + dir_mode=dir_mode, + ), + file_from_info, + file_to_infos, + ) + ) else: self.single_file_download( from_info, to_info, name, no_progress_bar, file_mode, dir_mode @@ -587,12 +585,7 @@ def download( def single_file_download( self, from_info, to_info, name, no_progress_bar, file_mode, dir_mode ): - # Make sure full path exist - # print(to_info) - for parent in reversed(to_info.parents): - if not parent.exists(): - # print('making dir: {}'.format(parent)) - makedirs(parent, exist_ok=True, mode=dir_mode) + makedirs(to_info.parent, exist_ok=True, mode=dir_mode) logger.debug("Downloading '{}' to '{}'".format(from_info, to_info)) name = name or to_info.name From 3af4f85310588c19cbd80e0abb4cd4aad145a5e2 Mon Sep 17 00:00:00 2001 From: Vera <9991536+verasativa@users.noreply.github.com> Date: Thu, 5 Dec 2019 16:45:13 -0300 Subject: [PATCH 4/7] Update dvc/remote/base.py Co-Authored-By: Ruslan Kuprieiev --- dvc/remote/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index c22f30b5a1..8922a5787e 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -548,7 +548,6 @@ def download( if to_info.scheme != "local": raise NotImplementedError - makedirs(to_info.parent, exist_ok=True, mode=dir_mode) if self.isdir(from_info): makedirs(to_info, exist_ok=True, mode=dir_mode) From 164ded64532c8e87afbeb35dc21814c184571572 Mon Sep 17 00:00:00 2001 From: Vera <9991536+verasativa@users.noreply.github.com> Date: Thu, 5 Dec 2019 16:45:43 -0300 Subject: [PATCH 5/7] Update dvc/remote/base.py Co-Authored-By: Ruslan Kuprieiev --- dvc/remote/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 8922a5787e..6cebec7c3a 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -550,7 +550,6 @@ def download( if self.isdir(from_info): - makedirs(to_info, exist_ok=True, mode=dir_mode) file_to_infos = ( to_info / file_to_info.relative_to(from_info) for file_to_info in self.walk_files(from_info) From f7e0b575bd9dfe568deb797dfb216afa3eacac67 Mon Sep 17 00:00:00 2001 From: Vera Sativa Date: Thu, 5 Dec 2019 16:51:24 -0300 Subject: [PATCH 6/7] tests: add download_dir --- tests/unit/remote/test_remote_dir.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/unit/remote/test_remote_dir.py b/tests/unit/remote/test_remote_dir.py index 583898a60d..9e740a9684 100644 --- a/tests/unit/remote/test_remote_dir.py +++ b/tests/unit/remote/test_remote_dir.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- import pytest +import os from dvc.remote.s3 import RemoteS3 @@ -132,3 +133,12 @@ def test_isfile(remote): for expected, path in test_cases: assert remote.isfile(remote.path_info / path) == expected + + +@pytest.mark.parametrize("remote", remotes, indirect=True) +def test_download_dir(remote, tmpdir): + path = os.fspath(tmpdir / "data") + to_info = os.PathInfo(path) + remote.download(remote.path_info / "data", to_info) + assert os.path.isdir(path) + # check the list of files From 74002fccb4d536271c8d8e6347396d00504f60cb Mon Sep 17 00:00:00 2001 From: "Restyled.io" Date: Thu, 5 Dec 2019 19:51:42 +0000 Subject: [PATCH 7/7] Restyled by black --- dvc/remote/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 6cebec7c3a..10f21d1ea0 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -548,7 +548,6 @@ def download( if to_info.scheme != "local": raise NotImplementedError - if self.isdir(from_info): file_to_infos = ( to_info / file_to_info.relative_to(from_info)