From 77fa865c846663f24ac314fd2281ec4c55bf2046 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 20 Jul 2020 18:03:54 +0200 Subject: [PATCH 01/16] tree,remote: add support for webdav Webdav support is based on https://pypi.org/project/webdavclient3/ and supports basic download/upload operation, directory creation as well as existence, file hash and isdir query. Copy, move and remove are also implemented, though probably not used yet. WebdavURLInfo is taken from https://github.com/shizacat/dvc/tree/remote-webdav Fixes iterative#1153 --- dvc/config.py | 9 +++ dvc/path_info.py | 14 ++++ dvc/scheme.py | 2 + dvc/tree/__init__.py | 4 ++ dvc/tree/webdav.py | 157 +++++++++++++++++++++++++++++++++++++++++++ dvc/tree/webdavs.py | 7 ++ setup.py | 4 +- 7 files changed, 196 insertions(+), 1 deletion(-) create mode 100644 dvc/tree/webdav.py create mode 100644 dvc/tree/webdavs.py diff --git a/dvc/config.py b/dvc/config.py index e969c104b8..ea7a629c1e 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -119,6 +119,13 @@ class RelPath(str): "password": str, "ask_password": Bool, } +WEBDAV_COMMON = { + "root": str, + "user": str, + "password": str, + "ask_password": Bool, +} + SCHEMA = { "core": { "remote": Lower, @@ -199,6 +206,8 @@ class RelPath(str): }, "http": {**HTTP_COMMON, **REMOTE_COMMON}, "https": {**HTTP_COMMON, **REMOTE_COMMON}, + "webdav": {**WEBDAV_COMMON, **REMOTE_COMMON}, + "webdavs": {**WEBDAV_COMMON, **REMOTE_COMMON}, "remote": {str: object}, # Any of the above options are valid } ) diff --git a/dvc/path_info.py b/dvc/path_info.py index 2420764ba4..13fb84fe31 100644 --- a/dvc/path_info.py +++ b/dvc/path_info.py @@ -315,3 +315,17 @@ def __eq__(self, other): and self._path == other._path and self._extra_parts == other._extra_parts ) + + +# See https://github.com/shizacat/dvc/blob/remote-webdav/dvc/path_info.py +class WebdavURLInfo(HTTPURLInfo): + @cached_property + def url(self): + return "{}://{}{}{}{}{}".format( + self.scheme.replace("webdav", "http"), + 
self.netloc, + self._spath, + (";" + self.params) if self.params else "", + ("?" + self.query) if self.query else "", + ("#" + self.fragment) if self.fragment else "", + ) diff --git a/dvc/scheme.py b/dvc/scheme.py index e64e24f5ac..76c6d7a497 100644 --- a/dvc/scheme.py +++ b/dvc/scheme.py @@ -9,3 +9,5 @@ class Schemes: GDRIVE = "gdrive" LOCAL = "local" OSS = "oss" + WEBDAV = "webdav" + WEBDAVS = "webdavs" diff --git a/dvc/tree/__init__.py b/dvc/tree/__init__.py index 24fcac4b0c..efef52b3e6 100644 --- a/dvc/tree/__init__.py +++ b/dvc/tree/__init__.py @@ -11,6 +11,8 @@ from .oss import OSSTree from .s3 import S3Tree from .ssh import SSHTree +from .webdav import WebdavTree +from .webdavs import WebdavsTree TREES = [ AzureTree, @@ -22,6 +24,8 @@ S3Tree, SSHTree, OSSTree, + WebdavTree, + WebdavsTree, # NOTE: LocalTree is the default ] diff --git a/dvc/tree/webdav.py b/dvc/tree/webdav.py new file mode 100644 index 0000000000..ac95b05956 --- /dev/null +++ b/dvc/tree/webdav.py @@ -0,0 +1,157 @@ +import logging +import threading + +from funcy import cached_property, wrap_prop + +from dvc.exceptions import DvcException +from dvc.path_info import WebdavURLInfo +from dvc.scheme import Schemes + +from .base import BaseTree +from .http import ask_password + +logger = logging.getLogger(__name__) + + +class WebdavTree(BaseTree): # pylint:disable=abstract-method + # Use webdav scheme + scheme = Schemes.WEBDAV + + # URLInfo for Webdav ~ replaces webdav -> http + PATH_CLS = WebdavURLInfo + + # Non traversable as walk_files is not implemented + CAN_TRAVERSE = False + + # Implementation based on webdav3.client + REQUIRES = {"webdavclient3": "webdav3.client"} + + # Constructor + def __init__(self, repo, config): + # Call BaseTree constructor + super().__init__(repo, config) + + # Get password from configuration (might be None ~ not set) + self.password = config.get("password", None) + + # Whether to ask for password is it is not set + self.ask_password = config.get("ask_password", 
False) + + # Webdav root directory + self.root = config.get("root", "/") + + # From HTTPTree + url = config.get("url") + if url: + self.path_info = self.PATH_CLS(url) + user = config.get("user", None) + if user: + self.path_info.user = user + else: + self.path_info = None + + # Webdav client + @wrap_prop(threading.Lock()) + @cached_property + def _client(self): + # Import the webdav client library + from webdav3.client import Client + + # Construct hostname from path_info + hostname = ( + self.path_info.scheme.replace("webdav", "http") + + "://" + + self.path_info.host + ) + + # Set password or ask for it + if self.ask_password and self.password is None: + host, user = self.path_info.host, self.path_info.user + self.password = ask_password(host, user) + + # Setup webdav client options dictionary + options = { + "webdav_hostname": hostname, + "webdav_root": self.root, + "webdav_login": self.path_info.user, + "webdav_password": self.password, + } + + # Create a webdav client as configured + return Client(options) + + # Checks whether file exists + def exists(self, path_info): + # Use webdav check to test for file existence + return self._client.check(path_info.path) + + # Gets file hash 'etag' + def get_file_hash(self, path_info): + # Use webdav client info method to get etag + etag = self._client.info(path_info.path)["etag"].strip('"') + + # From HTTPTree + if not etag: + raise DvcException( + "could not find an ETag or " + "Content-MD5 header for '{url}'".format(url=path_info.url) + ) + + if etag.startswith("W/"): + raise DvcException( + "Weak ETags are not supported." 
+ " (Etag: '{etag}', URL: '{url}')".format( + etag=etag, url=path_info.url + ) + ) + + return etag + + # Checks whether path points to directory + def isdir(self, path_info): + # Use webdav is_dir to test whether path points to a directory + return self._client.is_dir(path_info.path) + + # Removes file/directory + def remove(self, path_info): + # Use webdav client clean (DELETE) method to remove file/directory + self._client.clean(path_info.path) + + # Creates directories + def makedirs(self, path_info): + # Terminate recursion + if path_info.path == "/": + return + + # Recursively descent to root + self.makedirs(path_info.parent) + + # Construct directory at current recursion depth + self._client.mkdir(path_info.path) + + # Moves file/directory at remote + def move(self, from_info, to_info, mode=None): + # Webdav client move + self._client.move(from_info.path, to_info.path) + + # Copies file/directory at remote + def copy(self, from_info, to_info): + # Webdav client copy + self._client.copy(from_info.path, to_info.path) + + # Downloads file from remote to file + def _download(self, from_info, to_file, name=None, no_progress_bar=False): + # pylint: disable=unused-argument + + # Webdav client download + self._client.download(from_info.path, to_file) + + # Uploads file to remote + def _upload(self, from_file, to_info, name=None, no_progress_bar=False): + # pylint: disable=unused-argument + + # First try to create parent directories + self.makedirs(to_info.parent) + + # Now upload the file + self._client.upload(to_info.path, from_file) diff --git a/dvc/tree/webdavs.py b/dvc/tree/webdavs.py new file mode 100644 index 0000000000..68e8891b27 --- /dev/null +++ b/dvc/tree/webdavs.py @@ -0,0 +1,7 @@ +from dvc.scheme import Schemes + +from .webdav import WebdavTree + + +class WebdavsTree(WebdavTree): # pylint:disable=abstract-method + scheme = Schemes.WEBDAVS diff --git a/setup.py b/setup.py index 064cb12618..152e23cc48 100644 --- a/setup.py +++ b/setup.py @@ -93,12 +93,13 
@@ def run(self): oss = ["oss2==2.6.1"] ssh = ["paramiko>=2.5.0"] hdfs = ["pyarrow>=0.17.0"] +webdav = ["webdavclient3==3.14.5"] # gssapi should not be included in all_remotes, because it doesn't have wheels # for linux and mac, so it will fail to compile if user doesn't have all the # requirements, including kerberos itself. Once all the wheels are available, # we can start shipping it by default. ssh_gssapi = ["paramiko[gssapi]>=2.5.0"] -all_remotes = gs + s3 + azure + ssh + oss + gdrive + hdfs +all_remotes = gs + s3 + azure + ssh + oss + gdrive + hdfs + webdav # Extra dependecies to run tests tests_requirements = [ @@ -160,6 +161,7 @@ def run(self): "ssh": ssh, "ssh_gssapi": ssh_gssapi, "hdfs": hdfs, + "webdav": webdav, "tests": tests_requirements, }, keywords="data-science data-version-control machine-learning git" From 66a77e82cb9af052b88416edbb912fb7303ab7e5 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 20 Jul 2020 22:13:00 +0200 Subject: [PATCH 02/16] tree,remote: add further webdavclient3 options Webdav token auth, certificate and key path and connection timeout are configurable. Webdav username might be specified or extracted from URL. 
Refs iterative#1153 --- dvc/config.py | 6 +++++- dvc/tree/webdav.py | 46 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/dvc/config.py b/dvc/config.py index ea7a629c1e..416eb82eac 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -120,10 +120,14 @@ class RelPath(str): "ask_password": Bool, } WEBDAV_COMMON = { - "root": str, "user": str, "password": str, "ask_password": Bool, + "root": str, + "token": str, + "cert_path": str, + "key_path": str, + "timeout": Coerce(int), } SCHEMA = { diff --git a/dvc/tree/webdav.py b/dvc/tree/webdav.py index ac95b05956..50cba038d0 100644 --- a/dvc/tree/webdav.py +++ b/dvc/tree/webdav.py @@ -31,6 +31,9 @@ def __init__(self, repo, config): # Call BaseTree constructor super().__init__(repo, config) + # Get username from configuration + self.user = config.get("user", None) + # Get password from configuration (might be None ~ not set) self.password = config.get("password", None) @@ -40,13 +43,32 @@ def __init__(self, repo, config): # Webdav root directory self.root = config.get("root", "/") - # From HTTPTree - url = config.get("url") - if url: - self.path_info = self.PATH_CLS(url) - user = config.get("user", None) - if user: - self.path_info.user = user + # Use token for webdav auth + self.token = config.get("token", None) + + # Path to certificate + self.cert_path = config.get("cert_path", None) + + # Path to private key + self.key_path = config.get("key_path", None) + + # Connection timeout + self.timeout = config.get("timeout", 30) + + # Get URL from configuration + self.url = config.get("url", None) + + # If URL in config parse path_info + if self.url: + self.path_info = self.PATH_CLS(self.url) + + # If username not specified try to use from URL + if self.user is None and self.path_info.user is not None: + self.user = self.path_info.user + + # If username specified add to path_info + if self.user is not None: + self.path_info.user = self.user else: self.path_info = None @@ 
-65,16 +87,20 @@ def _client(self): ) # Set password or ask for it - if self.ask_password and self.password is None: + if self.ask_password and self.password is None and self.token is None: host, user = self.path_info.host, self.path_info.user self.password = ask_password(host, user) # Setup webdav client options dictionary options = { "webdav_hostname": hostname, - "webdav_root": self.root, - "webdav_login": self.path_info.user, + "webdav_login": self.user, "webdav_password": self.password, + "webdav_token": self.token, + "webdav_root": self.root, + "webdav_cert_path": self.cert_path, + "webdav_key_path": self.key_path, + "webdav_timeout": self.timeout, } # Create a webdav client as configured From 5c65291fbe7d64d0fee04f21bb68151227126ea8 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 21 Jul 2020 18:46:01 +0200 Subject: [PATCH 03/16] tree,remote: validate webdav client configuration Refs iterative#1153 --- dvc/exceptions.py | 5 +++++ dvc/tree/webdav.py | 11 +++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/dvc/exceptions.py b/dvc/exceptions.py index 204c378fdd..ff488125c6 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -299,6 +299,11 @@ def __init__(self, code, reason): super().__init__(f"'{code} {reason}'") +class WebdavConfigError(DvcException): + def __init__(self, host): + super().__init__(f"Configuration for WebDAV {host} is invalid.") + + class PathMissingError(DvcException): default_msg = ( "The path '{}' does not exist in the target repository '{}'" diff --git a/dvc/tree/webdav.py b/dvc/tree/webdav.py index 50cba038d0..daa0d2a33f 100644 --- a/dvc/tree/webdav.py +++ b/dvc/tree/webdav.py @@ -3,7 +3,7 @@ from funcy import cached_property, wrap_prop -from dvc.exceptions import DvcException +from dvc.exceptions import DvcException, WebdavConfigError from dvc.path_info import WebdavURLInfo from dvc.scheme import Schemes @@ -104,7 +104,14 @@ def _client(self): } # Create a webdav client as configured - return 
Client(options) + client = Client(options) + + # Check whether client options are valid + if not client.valid(): + raise WebdavConfigError(hostname) + + # Return constructed client (cached) + return client # Checks whether file exists def exists(self, path_info): From 73b5fbe493d81351c1e383ccfca0f8702cbfa4b0 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 21 Jul 2020 18:52:29 +0200 Subject: [PATCH 04/16] tree,remote: WebDAV is written with capitalized 'DAV'... Refs iterative#1153 --- dvc/exceptions.py | 2 +- dvc/path_info.py | 2 +- dvc/tree/__init__.py | 8 ++++---- dvc/tree/webdav.py | 10 +++++----- dvc/tree/webdavs.py | 4 ++-- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/dvc/exceptions.py b/dvc/exceptions.py index ff488125c6..b5b49ba1db 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -299,7 +299,7 @@ def __init__(self, code, reason): super().__init__(f"'{code} {reason}'") -class WebdavConfigError(DvcException): +class WebDAVConfigError(DvcException): def __init__(self, host): super().__init__(f"Configuration for WebDAV {host} is invalid.") diff --git a/dvc/path_info.py b/dvc/path_info.py index 13fb84fe31..281882a9bf 100644 --- a/dvc/path_info.py +++ b/dvc/path_info.py @@ -318,7 +318,7 @@ def __eq__(self, other): # See https://github.com/shizacat/dvc/blob/remote-webdav/dvc/path_info.py -class WebdavURLInfo(HTTPURLInfo): +class WebDAVURLInfo(HTTPURLInfo): @cached_property def url(self): return "{}://{}{}{}{}{}".format( diff --git a/dvc/tree/__init__.py b/dvc/tree/__init__.py index efef52b3e6..b9a38325c5 100644 --- a/dvc/tree/__init__.py +++ b/dvc/tree/__init__.py @@ -11,8 +11,8 @@ from .oss import OSSTree from .s3 import S3Tree from .ssh import SSHTree -from .webdav import WebdavTree -from .webdavs import WebdavsTree +from .webdav import WebDAVTree +from .webdavs import WebDAVSTree TREES = [ AzureTree, @@ -24,8 +24,8 @@ S3Tree, SSHTree, OSSTree, - WebdavTree, - WebdavsTree, + WebDAVTree, + WebDAVSTree, # NOTE: LocalTree is 
the default ] diff --git a/dvc/tree/webdav.py b/dvc/tree/webdav.py index daa0d2a33f..713d883822 100644 --- a/dvc/tree/webdav.py +++ b/dvc/tree/webdav.py @@ -3,8 +3,8 @@ from funcy import cached_property, wrap_prop -from dvc.exceptions import DvcException, WebdavConfigError -from dvc.path_info import WebdavURLInfo +from dvc.exceptions import DvcException, WebDAVConfigError +from dvc.path_info import WebDAVURLInfo from dvc.scheme import Schemes from .base import BaseTree @@ -13,12 +13,12 @@ logger = logging.getLogger(__name__) -class WebdavTree(BaseTree): # pylint:disable=abstract-method +class WebDAVTree(BaseTree): # pylint:disable=abstract-method # Use webdav scheme scheme = Schemes.WEBDAV # URLInfo for Webdav ~ replaces webdav -> http - PATH_CLS = WebdavURLInfo + PATH_CLS = WebDAVURLInfo # Non traversable as walk_files is not implemented CAN_TRAVERSE = False @@ -108,7 +108,7 @@ def _client(self): # Check whether client options are valid if not client.valid(): - raise WebdavConfigError(hostname) + raise WebDAVConfigError(hostname) # Return constructed client (cached) return client diff --git a/dvc/tree/webdavs.py b/dvc/tree/webdavs.py index 68e8891b27..570079507a 100644 --- a/dvc/tree/webdavs.py +++ b/dvc/tree/webdavs.py @@ -1,7 +1,7 @@ from dvc.scheme import Schemes -from .webdav import WebdavTree +from .webdav import WebDAVTree -class WebdavsTree(WebdavTree): # pylint:disable=abstract-method +class WebDAVSTree(WebDAVTree): # pylint:disable=abstract-method scheme = Schemes.WEBDAVS From f4d3d2e4220369db327aa5faa25bd3a2e371a7ff Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 22 Jul 2020 01:23:13 +0200 Subject: [PATCH 05/16] tree,remote: terminate WebDAV makedirs at self.path_info.path This enables the WebDAV api location (e.g. '/public.php/webdav') to be part of the remote 'url' configuration instead of being specified separately via the 'root' option. 
The 'root' option may then be used to specify real directories at the WebDAV storage, although using it to set the api location is still possible. Refs iterative#1153 --- dvc/tree/webdav.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/tree/webdav.py b/dvc/tree/webdav.py index 713d883822..b34d14480d 100644 --- a/dvc/tree/webdav.py +++ b/dvc/tree/webdav.py @@ -153,7 +153,7 @@ def remove(self, path_info): # Creates directories def makedirs(self, path_info): # Terminate recursion - if path_info.path == "/": + if path_info.path == self.path_info.path: return # Recursively descent to root From ee1813e6f73859048c11edf18c566914b8775df9 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 22 Jul 2020 14:16:18 +0200 Subject: [PATCH 06/16] tree,remote: use >=3.14.5 for WebDAV dependency webdavclient3 Context: https://github.com/iterative/dvc/pull/4256#discussion_r458490609 Refs iterative#1153 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 152e23cc48..85801b87c5 100644 --- a/setup.py +++ b/setup.py @@ -93,7 +93,7 @@ def run(self): oss = ["oss2==2.6.1"] ssh = ["paramiko>=2.5.0"] hdfs = ["pyarrow>=0.17.0"] -webdav = ["webdavclient3==3.14.5"] +webdav = ["webdavclient3>=3.14.5"] # gssapi should not be included in all_remotes, because it doesn't have wheels # for linux and mac, so it will fail to compile if user doesn't have all the # requirements, including kerberos itself. Once all the wheels are available, From 0e3a1772086091d66a330d2b1851cb38364ff3b4 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 22 Jul 2020 18:40:42 +0200 Subject: [PATCH 07/16] tree,remote: get rid of WebDAV 'root' option and add connection check The WebDAV 'root' option was rather confusing and should be handled by the initial 'path_info' from the config 'url' option. 
Context: https://github.com/iterative/dvc/pull/4256#discussion_r458754500 While stripping the path/root from the hostname the port got lost, which is fixed now by simply using the URLInfo 'replace' method as suggested. Context: https://github.com/iterative/dvc/pull/4256#discussion_r458842853 The WebDAV client connection is tested by probing the existence of the root (self.path_info.path). Refs iterative#1153 --- dvc/config.py | 1 - dvc/exceptions.py | 5 +++++ dvc/tree/webdav.py | 25 +++++++++++++------------ 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/dvc/config.py b/dvc/config.py index 416eb82eac..68cda1bece 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -123,7 +123,6 @@ class RelPath(str): "user": str, "password": str, "ask_password": Bool, - "root": str, "token": str, "cert_path": str, "key_path": str, diff --git a/dvc/exceptions.py b/dvc/exceptions.py index b5b49ba1db..866b0ab42f 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -304,6 +304,11 @@ def __init__(self, host): super().__init__(f"Configuration for WebDAV {host} is invalid.") +class WebDAVConnectionError(DvcException): + def __init__(self, host): + super().__init__(f"Unable to connect to WebDAV {host}.") + + class PathMissingError(DvcException): default_msg = ( "The path '{}' does not exist in the target repository '{}'" diff --git a/dvc/tree/webdav.py b/dvc/tree/webdav.py index b34d14480d..238fcc7414 100644 --- a/dvc/tree/webdav.py +++ b/dvc/tree/webdav.py @@ -3,8 +3,12 @@ from funcy import cached_property, wrap_prop -from dvc.exceptions import DvcException, WebDAVConfigError -from dvc.path_info import WebDAVURLInfo +from dvc.exceptions import ( + DvcException, + WebDAVConfigError, + WebDAVConnectionError, +) +from dvc.path_info import HTTPURLInfo, WebDAVURLInfo from dvc.scheme import Schemes from .base import BaseTree @@ -40,9 +44,6 @@ def __init__(self, repo, config): # Whether to ask for password is it is not set self.ask_password = config.get("ask_password", False) 
- # Webdav root directory - self.root = config.get("root", "/") - # Use token for webdav auth self.token = config.get("token", None) @@ -79,12 +80,9 @@ def _client(self): # Import the webdav client library from webdav3.client import Client - # Construct hostname from path_info - hostname = ( - self.path_info.scheme.replace("webdav", "http") - + "://" - + self.path_info.host - ) + # Construct hostname from path_info by stripping path + http_info = HTTPURLInfo(self.path_info.url) + hostname = http_info.replace(path="").url # Set password or ask for it if self.ask_password and self.password is None and self.token is None: @@ -97,7 +95,6 @@ def _client(self): "webdav_login": self.user, "webdav_password": self.password, "webdav_token": self.token, - "webdav_root": self.root, "webdav_cert_path": self.cert_path, "webdav_key_path": self.key_path, "webdav_timeout": self.timeout, @@ -110,6 +107,10 @@ def _client(self): if not client.valid(): raise WebDAVConfigError(hostname) + # Check whether connection is valid (root should always exist) + if not client.check(self.path_info.path): + raise WebDAVConnectionError(hostname) + # Return constructed client (cached) return client From 4ec60012f5787516078206d35e2a9c94db06d6a8 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Sun, 26 Jul 2020 23:21:13 +0200 Subject: [PATCH 08/16] tree,remote: implement walk_files for WebDAV Context: https://github.com/iterative/dvc/pull/4256#discussion_r458751391 Refs: iterative#1153 --- dvc/tree/webdav.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/dvc/tree/webdav.py b/dvc/tree/webdav.py index 238fcc7414..666f391333 100644 --- a/dvc/tree/webdav.py +++ b/dvc/tree/webdav.py @@ -1,5 +1,6 @@ import logging import threading +from collections import deque from funcy import cached_property, wrap_prop @@ -24,8 +25,11 @@ class WebDAVTree(BaseTree): # pylint:disable=abstract-method # URLInfo for Webdav ~ replaces webdav -> http PATH_CLS = 
WebDAVURLInfo - # Non traversable as walk_files is not implemented - CAN_TRAVERSE = False + # Traversable as walk_files is implemented + CAN_TRAVERSE = True + + # Length of walk_files prefix + TRAVERSE_PREFIX_LEN = 2 # Implementation based on webdav3.client REQUIRES = {"webdavclient3": "webdav3.client"} @@ -146,6 +150,33 @@ def isdir(self, path_info): # Use webdav is_dir to test whether path points to a directory return self._client.is_dir(path_info.path) + # Yields path info to all files + def walk_files(self, path_info, **kwargs): + # Check whether directory exists + if not self.exists(path_info): + return + + # Collect directories + dirs = deque([path_info.path]) + + # Iterate all directories found so far + while dirs: + # Nex directory + next_dir = path_info.replace(path=dirs.pop()) + + # Iterate directory content + for entry in self._client.list(next_dir.path): + # Construct path_info to entry + info = self.PATH_CLS(f"{next_dir.url}/{entry}") + + # Check whether entry is a directory + if self.isdir(info): + # Append new found directory to directory list + dirs.append(info.path) + else: + # Yield path info to non directory + yield info + # Removes file/directory def remove(self, path_info): # Use webdav client clean (DELETE) method to remove file/directory From e6c91fb6bcb2771a3518784b5f80ce483ee3ad2f Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Sun, 26 Jul 2020 23:49:17 +0200 Subject: [PATCH 09/16] tree,remote: let WebDAV client list query file info in walk_files Context: https://github.com/iterative/dvc/pull/4256#discussion_r458751391 Refs iterative#1153 --- dvc/tree/webdav.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/dvc/tree/webdav.py b/dvc/tree/webdav.py index 666f391333..d99b8ed5e7 100644 --- a/dvc/tree/webdav.py +++ b/dvc/tree/webdav.py @@ -161,16 +161,13 @@ def walk_files(self, path_info, **kwargs): # Iterate all directories found so far while dirs: - # Nex directory - next_dir = 
path_info.replace(path=dirs.pop()) - # Iterate directory content - for entry in self._client.list(next_dir.path): + for entry in self._client.list(dirs.pop(), get_info=True): # Construct path_info to entry - info = self.PATH_CLS(f"{next_dir.url}/{entry}") + info = path_info.replace(path=entry["path"]) # Check whether entry is a directory - if self.isdir(info): + if entry["isdir"]: # Append new found directory to directory list dirs.append(info.path) else: From f4c368a7ce953d99ce132148a9f59b51db7c11b7 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 27 Jul 2020 00:21:30 +0200 Subject: [PATCH 10/16] tree,remote: add some unit tests for WebDAVTree Refs iterative#1153 --- tests/unit/remote/test_webdav.py | 42 ++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 tests/unit/remote/test_webdav.py diff --git a/tests/unit/remote/test_webdav.py b/tests/unit/remote/test_webdav.py new file mode 100644 index 0000000000..7691f6b55b --- /dev/null +++ b/tests/unit/remote/test_webdav.py @@ -0,0 +1,42 @@ +from dvc.tree.webdav import WebDAVTree + +# Test configuration +url = "webdavs://example.com/public.php/webdav" +user = "username" +userurl = f"webdavs://{user}@example.com/public.php/webdav" +password = "password" + + +# Test minimum requiered configuration (url) +def test_init(dvc): + config = {"url": url} + tree = WebDAVTree(dvc, config) + + assert tree.path_info == url + + +# Test username from configuration +def test_user(dvc): + config = {"url": url, "user": user} + tree = WebDAVTree(dvc, config) + + assert tree.user == user + assert tree.path_info.user == user + + +# Test username extraction from url +def test_userurl(dvc): + config = {"url": userurl} + tree = WebDAVTree(dvc, config) + + assert tree.path_info == userurl + assert tree.user == user + assert tree.path_info.user == user + + +# test password from config +def test_password(dvc): + config = {"url": url, "user": user, "password": password} + tree = WebDAVTree(dvc, config) 
+ + assert tree.password == password From 678b1e7d7568febce722d71a62b5d39a40e1bee6 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 27 Jul 2020 13:08:19 +0200 Subject: [PATCH 11/16] tree,remote: use ConfigError for WebDAVTree and move Error to webdav.py Context: https://github.com/iterative/dvc/pull/4256#discussion_r460591937 Refs iterative#1153 --- dvc/exceptions.py | 10 ---------- dvc/tree/webdav.py | 16 ++++++++++------ 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/dvc/exceptions.py b/dvc/exceptions.py index 866b0ab42f..204c378fdd 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -299,16 +299,6 @@ def __init__(self, code, reason): super().__init__(f"'{code} {reason}'") -class WebDAVConfigError(DvcException): - def __init__(self, host): - super().__init__(f"Configuration for WebDAV {host} is invalid.") - - -class WebDAVConnectionError(DvcException): - def __init__(self, host): - super().__init__(f"Unable to connect to WebDAV {host}.") - - class PathMissingError(DvcException): default_msg = ( "The path '{}' does not exist in the target repository '{}'" diff --git a/dvc/tree/webdav.py b/dvc/tree/webdav.py index d99b8ed5e7..7071ca59d8 100644 --- a/dvc/tree/webdav.py +++ b/dvc/tree/webdav.py @@ -4,11 +4,8 @@ from funcy import cached_property, wrap_prop -from dvc.exceptions import ( - DvcException, - WebDAVConfigError, - WebDAVConnectionError, -) +from dvc.config import ConfigError +from dvc.exceptions import DvcException from dvc.path_info import HTTPURLInfo, WebDAVURLInfo from dvc.scheme import Schemes @@ -18,6 +15,11 @@ logger = logging.getLogger(__name__) +class WebDAVConnectionError(DvcException): + def __init__(self, host): + super().__init__(f"Unable to connect to WebDAV {host}.") + + class WebDAVTree(BaseTree): # pylint:disable=abstract-method # Use webdav scheme scheme = Schemes.WEBDAV @@ -109,7 +111,9 @@ def _client(self): # Check whether client options are valid if not client.valid(): - raise 
WebDAVConfigError(hostname) + raise ConfigError( + f"Configuration for WebDAV {hostname} is invalid." + ) # Check whether connection is valid (root should always exist) if not client.check(self.path_info.path): From 5df4829deabbacb7077c35bcaa79ebd9e3107309 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 27 Jul 2020 13:29:53 +0200 Subject: [PATCH 12/16] tree,remote: remove/change some (unnecessary) comments Context: https://github.com/iterative/dvc/pull/4256#discussion_r460592181 --- dvc/tree/webdav.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/dvc/tree/webdav.py b/dvc/tree/webdav.py index 7071ca59d8..3baedab319 100644 --- a/dvc/tree/webdav.py +++ b/dvc/tree/webdav.py @@ -47,7 +47,7 @@ def __init__(self, repo, config): # Get password from configuration (might be None ~ not set) self.password = config.get("password", None) - # Whether to ask for password is it is not set + # Whether to ask for password if it is not set self.ask_password = config.get("ask_password", False) # Use token for webdav auth @@ -83,7 +83,6 @@ def __init__(self, repo, config): @wrap_prop(threading.Lock()) @cached_property def _client(self): - # Import the webdav client library from webdav3.client import Client # Construct hostname from path_info by stripping path @@ -106,7 +105,6 @@ def _client(self): "webdav_timeout": self.timeout, } - # Create a webdav client as configured client = Client(options) # Check whether client options are valid @@ -119,10 +117,9 @@ def _client(self): if not client.check(self.path_info.path): raise WebDAVConnectionError(hostname) - # Return constructed client (cached) return client - # Checks whether file exists + # Checks whether file/directory exists at remote def exists(self, path_info): # Use webdav check to test for file existence return self._client.check(path_info.path) From 68d6ab1977bc95d5c6dda3afb62b8509dc1d022a Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 27 Jul 2020 15:00:05 +0200 Subject: 
[PATCH 13/16] tree,remote: when uploading to WebDAV, only create directories that do not exist --- dvc/tree/webdav.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/tree/webdav.py b/dvc/tree/webdav.py index 3baedab319..462db8af23 100644 --- a/dvc/tree/webdav.py +++ b/dvc/tree/webdav.py @@ -183,7 +183,7 @@ def remove(self, path_info): # Creates directories def makedirs(self, path_info): # Terminate recursion - if path_info.path == self.path_info.path: + if path_info.path == self.path_info.path or self.exists(path_info): return # Recursively descent to root From 4f9b34edbf16af7acb5ba413f30203e81716afe0 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 27 Jul 2020 15:26:49 +0200 Subject: [PATCH 14/16] tree,remote: add BaseTree parameter use_dvcignore to WebDAVTree exists Refs iterative#1153 --- dvc/tree/webdav.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/tree/webdav.py b/dvc/tree/webdav.py index 462db8af23..acabf5ff52 100644 --- a/dvc/tree/webdav.py +++ b/dvc/tree/webdav.py @@ -120,7 +120,7 @@ def _client(self): return client # Checks whether file/directory exists at remote - def exists(self, path_info): + def exists(self, path_info, use_dvcignore=True): # Use webdav check to test for file existence return self._client.check(path_info.path) From d2c9d599e7aca0ba345ebb7ec4b18f3daac84126 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 27 Jul 2020 15:38:04 +0200 Subject: [PATCH 15/16] tree,remote: remove WebDAVTree copy method as proposed Context: https://github.com/iterative/dvc/pull/4256#discussion_r460892725 Refs iterative#1153 --- dvc/tree/webdav.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/dvc/tree/webdav.py b/dvc/tree/webdav.py index acabf5ff52..346b1b761a 100644 --- a/dvc/tree/webdav.py +++ b/dvc/tree/webdav.py @@ -197,11 +197,6 @@ def move(self, from_info, to_info, mode=None): # Webdav client move self._client.move(from_info.path, to_info.path) - # Copies file/directory at remote - def 
copy(self, from_info, to_info): - # Webdav client copy - self._client.copy(from_info.path, to_info.path) - # Downloads file from remote to file def _download(self, from_info, to_file, name=None, no_progress_bar=False): # pylint: disable=unused-argument From 5203770b15d10279515197d577bf6be2b5dab9bd Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 27 Jul 2020 19:17:04 +0200 Subject: [PATCH 16/16] tree,remote: add progress bar to WebDAV _download and _upload method Context: https://github.com/iterative/dvc/pull/4256#discussion_r460593130 Refs iterative#1153 --- dvc/tree/webdav.py | 54 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/dvc/tree/webdav.py b/dvc/tree/webdav.py index 346b1b761a..115de31767 100644 --- a/dvc/tree/webdav.py +++ b/dvc/tree/webdav.py @@ -1,4 +1,5 @@ import logging +import os import threading from collections import deque @@ -7,6 +8,7 @@ from dvc.config import ConfigError from dvc.exceptions import DvcException from dvc.path_info import HTTPURLInfo, WebDAVURLInfo +from dvc.progress import Tqdm from dvc.scheme import Schemes from .base import BaseTree @@ -36,6 +38,9 @@ class WebDAVTree(BaseTree): # pylint:disable=abstract-method # Implementation based on webdav3.client REQUIRES = {"webdavclient3": "webdav3.client"} + # Chunk size for buffered upload/download with progress bar + CHUNK_SIZE = 2 ** 16 + # Constructor def __init__(self, repo, config): # Call BaseTree constructor @@ -103,6 +108,7 @@ def _client(self): "webdav_cert_path": self.cert_path, "webdav_key_path": self.key_path, "webdav_timeout": self.timeout, + "webdav_chunk_size": self.CHUNK_SIZE, } client = Client(options) @@ -199,17 +205,49 @@ def move(self, from_info, to_info, mode=None): # Downloads file from remote to file def _download(self, from_info, to_file, name=None, no_progress_bar=False): - # pylint: disable=unused-argument - - # Webdav client download - self._client.download(from_info.path, to_file) + # 
Progress from HTTPTree + with open(to_file, "wb") as fd: + with Tqdm.wrapattr( + fd, + "write", + total=None if no_progress_bar else self._file_size(from_info), + leave=False, + desc=from_info.url if name is None else name, + disable=no_progress_bar, + ) as fd_wrapped: + # Download from WebDAV via buffer + self._client.download_from( + buff=fd_wrapped, remote_path=from_info.path + ) # Uploads file to remote def _upload(self, from_file, to_info, name=None, no_progress_bar=False): - # pylint: disable=unused-argument - # First try to create parent directories self.makedirs(to_info.parent) - # Now upload the file - self._client.upload(to_info.path, from_file) + # Progress from HTTPTree + def chunks(): + with open(from_file, "rb") as fd: + with Tqdm.wrapattr( + fd, + "read", + total=None + if no_progress_bar + else os.path.getsize(from_file), + leave=False, + desc=to_info.url if name is None else name, + disable=no_progress_bar, + ) as fd_wrapped: + while True: + chunk = fd_wrapped.read(self.CHUNK_SIZE) + if not chunk: + break + yield chunk + + # Upload to WebDAV via buffer + self._client.upload_to(buff=chunks(), remote_path=to_info.path) + + # Queries size of file at remote + def _file_size(self, path_info): + # Get file size from info dictionary and convert to int (from str) + return int(self._client.info(path_info.path)["size"])