From bf7f0ad2a9234fa250474178b499a9e3cfbee12b Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 14 Feb 2020 15:59:42 +0900 Subject: [PATCH 1/8] remote.http: support _upload() via HTTP POST - uploaded files are sent as chunked encoding POST data Fixes #3247 --- dvc/remote/http.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/dvc/remote/http.py b/dvc/remote/http.py index f3b3fb5a55..d7338b3d56 100644 --- a/dvc/remote/http.py +++ b/dvc/remote/http.py @@ -1,4 +1,5 @@ import logging +import os.path import threading from funcy import cached_property, wrap_prop @@ -48,6 +49,28 @@ def _download(self, from_info, to_file, name=None, no_progress_bar=False): fd.write(chunk) pbar.update(len(chunk)) + def _upload(self, from_file, to_info, name=None, no_progress_bar=False): + with Tqdm( + total=None if no_progress_bar else os.path.getsize(from_file), + leave=False, + bytes=True, + desc=to_info.url if name is None else name, + disable=no_progress_bar, + ) as pbar: + + def chunks(): + with open(from_file, "rb") as fd: + while True: + chunk = fd.read(self.CHUNK_SIZE) + if not chunk: + break + pbar.update(len(chunk)) + yield chunk + + response = self._request("POST", to_info.url, data=chunks()) + if response.status_code != 200: + raise HTTPError(response.status_code, response.reason) + def exists(self, path_info): return bool(self._request("HEAD", path_info.url)) From 3e73a59bdd0654cb98699fc3b255b34b57dddea4 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Mon, 17 Feb 2020 13:31:31 +0900 Subject: [PATCH 2/8] remote.http: add support for HTTP basic and digest auth - username, password, ask_password options work the same way as for ssh remotes - basic_auth (bool) and digest_auth (bool) options added for http(s) remotes - digest_auth takes precedence over basic_auth if both are enabled --- dvc/config.py | 11 +++++++++-- dvc/remote/http.py | 44 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/dvc/config.py b/dvc/config.py index d893855f71..98a3bba315 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -101,6 +101,13 @@ class RelPath(str): "shared": All(Lower, Choices("group")), Optional("slow_link_warning", default=True): Bool, } +HTTP_COMMON = { + "basic_auth": Bool, + "digest_auth": Bool, + "user": str, + "password": str, + "ask_password": Bool, +} SCHEMA = { "core": { "remote": Lower, @@ -169,8 +176,8 @@ class RelPath(str): "gdrive_user_credentials_file": str, **REMOTE_COMMON, }, - "http": REMOTE_COMMON, - "https": REMOTE_COMMON, + "http": {**HTTP_COMMON, **REMOTE_COMMON}, + "https": {**HTTP_COMMON, **REMOTE_COMMON}, "remote": {str: object}, # Any of the above options are valid } ) diff --git a/dvc/remote/http.py b/dvc/remote/http.py index d7338b3d56..bf540c5601 100644 --- a/dvc/remote/http.py +++ b/dvc/remote/http.py @@ -2,8 +2,9 @@ import os.path import threading -from funcy import cached_property, wrap_prop +from funcy import cached_property, memoize, wrap_prop, wrap_with +import dvc.prompt as prompt from dvc.config import ConfigError from dvc.exceptions import DvcException, HTTPError from dvc.progress import Tqdm @@ -13,6 +14,15 @@ logger = logging.getLogger(__name__) +@wrap_with(threading.Lock()) +@memoize +def ask_password(host, user): + return prompt.password( + "Enter a password for " + "host '{host}' user '{user}'".format(host=host, user=user) + ) + + class RemoteHTTP(RemoteBASE): scheme = Schemes.HTTP SESSION_RETRIES = 5 @@ -25,7 +35,13 @@ def __init__(self, repo, config): super().__init__(repo, config) url = config.get("url") - self.path_info = self.path_cls(url) if url else None + if url: + self.path_info = self.path_cls(url) + user = config.get("user", None) + if user: + self.path_info.user = user + else: + self.path_info = None if not self.no_traverse: raise ConfigError( @@ -33,6 +49,11 @@ def __init__(self, repo, config): "files. Use: `dvc remote modify no_traverse true`" ) + self.basic_auth = config.get("basic_auth", False) + self.digest_auth = config.get("digest_auth", False) + self.password = config.get("password", None) + self.ask_password = config.get("ask_password", False) + def _download(self, from_info, to_file, name=None, no_progress_bar=False): response = self._request("GET", from_info.url, stream=True) if response.status_code != 200: @@ -97,6 +118,21 @@ def get_file_checksum(self, path_info): return etag + def auth_method(self, path_info=None): + from requests.auth import HTTPBasicAuth, HTTPDigestAuth + + if path_info is None: + path_info = self.path_info + + if self.basic_auth or self.digest_auth: + if self.ask_password and self.password is None: + host, user = path_info.host, path_info.user + self.password = ask_password(host, user) + if self.digest_auth: + return HTTPDigestAuth(path_info.user, self.password) + return HTTPBasicAuth(path_info.user, self.password) + return None + @wrap_prop(threading.Lock()) @cached_property def _session(self): @@ -123,7 +159,9 @@ def _request(self, method, url, **kwargs): kwargs.setdefault("timeout", self.REQUEST_TIMEOUT) try: - res = self._session.request(method, url, **kwargs) + res = self._session.request( + method, url, auth=self.auth_method(), **kwargs, + ) redirect_no_location = ( kwargs["allow_redirects"] From c55fb3d7c4d4c9b1c39f8d842b0827ff9b898afc Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Mon, 17 Feb 2020 15:20:20 +0900 Subject: [PATCH 3/8] remote.http: auth_method() unit tests --- tests/unit/remote/test_http.py | 53 ++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tests/unit/remote/test_http.py b/tests/unit/remote/test_http.py index 20c8854fce..ea620103b5 100644 --- a/tests/unit/remote/test_http.py +++ b/tests/unit/remote/test_http.py @@ -27,3 +27,56 @@ def test_download_fails_on_error_code(dvc): with pytest.raises(HTTPError): remote._download(URLInfo(url) / "missing.txt", "missing.txt") + + +def test_public_auth_method(dvc): + config = { + "url": "http://example.com/", + "path_info": "file.html", + "user": "", + "password": "", + } + + remote = RemoteHTTP(dvc, config) + + assert remote.auth_method() is None + + +def test_basic_auth_method(dvc): + from requests.auth import HTTPBasicAuth + + user = "username" + password = "password" + auth = HTTPBasicAuth(user, password) + config = { + "url": "http://example.com/", + "path_info": "file.html", + "user": user, + "password": password, + "basic_auth": True, + } + + remote = RemoteHTTP(dvc, config) + + assert remote.auth_method() == auth + assert isinstance(remote.auth_method(), HTTPBasicAuth) + + +def test_digest_auth_method(dvc): + from requests.auth import HTTPDigestAuth + + user = "username" + password = "password" + auth = HTTPDigestAuth(user, password) + config = { + "url": "http://example.com/", + "path_info": "file.html", + "user": user, + "password": password, + "digest_auth": True, + } + + remote = RemoteHTTP(dvc, config) + + assert remote.auth_method() == auth + assert isinstance(remote.auth_method(), HTTPDigestAuth) From b45b9f7571e400b1b067aa18d49b9c964babe7a7 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Tue, 18 Feb 2020 15:35:23 +0900 Subject: [PATCH 4/8] tests: Add functional tests for HTTP remotes - HTTP remotes now tested locally using a SimpleHTTPServer instance that allows reading/writing to a temp directory --- tests/conftest.py | 7 +++++ tests/func/test_data_cloud.py | 15 +++++++++++ tests/remotes.py | 8 ++++++ tests/utils/httpd.py | 49 ++++++++++++++++++++++++++++++++++- 4 files changed, 78 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 6e01739aac..3af3225e64 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ import pytest from dvc.remote.ssh.connection import SSHConnection +from tests.utils.httpd import TempFileServer from .dir_helpers import * # noqa @@ -57,3 +58,9 @@ def _close_pools(): yield close_pools() + + +@pytest.fixture +def http_server(): + with TempFileServer() as httpd: + yield httpd diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index 39df54bbbf..b28c2e438b 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -30,6 +30,7 @@ GCP, GDrive, HDFS, + HTTP, Local, S3, SSHMocked, @@ -290,6 +291,20 @@ def _get_cloud_class(self): return RemoteHDFS +@pytest.mark.usefixtures("http_server") +class TestRemoteHTTP(HTTP, TestDataCloudBase): + @pytest.fixture(autouse=True) + def setup_method_fixture(self, request, http_server): + self.http_server = http_server + self.method_name = request.function.__name__ + + def get_url(self): + return super().get_url(self.http_server.server_port) + + def _get_cloud_class(self): + return RemoteHTTP + + class TestDataCloudCLIBase(TestDvc): def main(self, args): ret = main(args) diff --git a/tests/remotes.py b/tests/remotes.py index 2102ad9632..6964fc8a77 100644 --- a/tests/remotes.py +++ b/tests/remotes.py @@ -275,3 +275,11 @@ def get_url(): return "hdfs://{}@127.0.0.1{}".format( getpass.getuser(), Local.get_storagepath() ) + + +class HTTP: + should_test = always_test + + @staticmethod + def get_url(port): + return "http://127.0.0.1:{}".format(port) diff --git a/tests/utils/httpd.py b/tests/utils/httpd.py index 2a3091eb37..26e5f8d539 100644 --- a/tests/utils/httpd.py +++ b/tests/utils/httpd.py @@ -1,7 +1,11 @@ import hashlib import os +import shutil +import tempfile import threading -from http.server import HTTPServer +from functools import partial +from http import HTTPStatus +from http.server import HTTPServer, SimpleHTTPRequestHandler from RangeHTTPServer import RangeRequestHandler @@ -35,6 +39,35 @@ class ContentMD5Handler(TestRequestHandler): checksum_header = "Content-MD5" +class PushRequestHandler(SimpleHTTPRequestHandler): + def _chunks(self): + while True: + data = self.rfile.readline(65537) + chunk_size = int(data[:-2], 16) + if chunk_size == 0: + return + data = self.rfile.read(chunk_size) + yield data + self.rfile.read(2) + + def do_POST(self): + chunked = self.headers.get("Transfer-Encoding", "") == "chunked" + path = os.path.join(self.directory, self.translate_path(self.path)) + try: + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "wb") as fd: + if chunked: + for chunk in self._chunks(): + fd.write(chunk) + else: + size = int(self.headers.get("Content-Length", 0)) + fd.write(self.rfile.read(size)) + except Exception as e: + self.send_error(HTTPStatus.INTERNAL_SERVER_ERROR, str(e)) + self.send_response(HTTPStatus.OK) + self.end_headers() + + class StaticFileServer: _lock = threading.Lock() @@ -54,3 +87,17 @@ def __exit__(self, *args): self._httpd.shutdown() self._httpd.server_close() self._lock.release() + + +class TempFileServer(StaticFileServer): + def __init__(self): + self.directory = tempfile.mkdtemp() + handler_class = partial(PushRequestHandler, directory=self.directory) + super().__init__(handler_class=handler_class) + + def __exit__(self, *args): + super().__exit__(*args) + self.cleanup() + + def cleanup(self): + shutil.rmtree(self.directory) From 5a92a8b7ed59c9924da2e338ce81059faf259264 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Tue, 18 Feb 2020 17:38:49 +0900 Subject: [PATCH 5/8] tests: fix http tests on py3.5, 3.6 --- tests/conftest.py | 6 +++--- tests/utils/httpd.py | 21 ++------------------- 2 files changed, 5 insertions(+), 22 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3af3225e64..a6a4c9ba8e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,7 +4,7 @@ import pytest from dvc.remote.ssh.connection import SSHConnection -from tests.utils.httpd import TempFileServer +from tests.utils.httpd import PushRequestHandler, StaticFileServer from .dir_helpers import * # noqa @@ -61,6 +61,6 @@ def _close_pools(): @pytest.fixture -def http_server(): - with TempFileServer() as httpd: +def http_server(tmp_dir): + with StaticFileServer(handler_class=PushRequestHandler) as httpd: yield httpd diff --git a/tests/utils/httpd.py b/tests/utils/httpd.py index 26e5f8d539..7cc75f2e60 100644 --- a/tests/utils/httpd.py +++ b/tests/utils/httpd.py @@ -1,9 +1,6 @@ import hashlib import os -import shutil -import tempfile import threading -from functools import partial from http import HTTPStatus from http.server import HTTPServer, SimpleHTTPRequestHandler from RangeHTTPServer import RangeRequestHandler @@ -52,7 +49,7 @@ def _chunks(self): def do_POST(self): chunked = self.headers.get("Transfer-Encoding", "") == "chunked" - path = os.path.join(self.directory, self.translate_path(self.path)) + path = self.translate_path(self.path) try: os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, "wb") as fd: @@ -62,7 +59,7 @@ def do_POST(self): else: size = int(self.headers.get("Content-Length", 0)) fd.write(self.rfile.read(size)) - except Exception as e: + except (IOError, OSError) as e: self.send_error(HTTPStatus.INTERNAL_SERVER_ERROR, str(e)) self.send_response(HTTPStatus.OK) self.end_headers() @@ -87,17 +84,3 @@ def __exit__(self, *args): self._httpd.shutdown() self._httpd.server_close() self._lock.release() - - -class TempFileServer(StaticFileServer): - def __init__(self): - self.directory = tempfile.mkdtemp() - handler_class = partial(PushRequestHandler, directory=self.directory) - super().__init__(handler_class=handler_class) - - def __exit__(self, *args): - super().__exit__(*args) - self.cleanup() - - def cleanup(self): - shutil.rmtree(self.directory) From fa69951ee79f9e8d168ff1573b5430308c96366b Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Wed, 19 Feb 2020 13:04:52 +0900 Subject: [PATCH 6/8] remote.http: add custom auth method --- dvc/config.py | 4 ++-- dvc/remote/http.py | 20 ++++++++++++++------ tests/unit/remote/test_http.py | 22 ++++++++++++++++++++-- 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/dvc/config.py b/dvc/config.py index 98a3bba315..0d3292f374 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -102,8 +102,8 @@ class RelPath(str): Optional("slow_link_warning", default=True): Bool, } HTTP_COMMON = { - "basic_auth": Bool, - "digest_auth": Bool, + "auth": All(Lower, Choices("basic", "digest", "custom")), + "custom_header": str, "user": str, "password": str, "ask_password": Bool, diff --git a/dvc/remote/http.py b/dvc/remote/http.py index bf540c5601..ed480d7a83 100644 --- a/dvc/remote/http.py +++ b/dvc/remote/http.py @@ -49,10 +49,11 @@ def __init__(self, repo, config): "files. Use: `dvc remote modify no_traverse true`" ) - self.basic_auth = config.get("basic_auth", False) - self.digest_auth = config.get("digest_auth", False) + self.auth = config.get("auth", None) + self.custom_header = config.get("custom_header", None) self.password = config.get("password", None) self.ask_password = config.get("ask_password", False) + self.headers = {} def _download(self, from_info, to_file, name=None, no_progress_bar=False): response = self._request("GET", from_info.url, stream=True) @@ -124,13 +125,16 @@ def auth_method(self, path_info=None): if path_info is None: path_info = self.path_info - if self.basic_auth or self.digest_auth: + if self.auth: if self.ask_password and self.password is None: host, user = path_info.host, path_info.user self.password = ask_password(host, user) - if self.digest_auth: + if self.auth == "basic": + return HTTPBasicAuth(path_info.user, self.password) + elif self.auth == "digest": return HTTPDigestAuth(path_info.user, self.password) - return HTTPBasicAuth(path_info.user, self.password) + elif self.auth == "custom" and self.custom_header: + self.headers.update({self.custom_header: self.password}) return None @wrap_prop(threading.Lock()) @@ -160,7 +164,11 @@ def _request(self, method, url, **kwargs): try: res = self._session.request( - method, url, auth=self.auth_method(), **kwargs, + method, + url, + auth=self.auth_method(), + headers=self.headers, + **kwargs, ) redirect_no_location = ( diff --git a/tests/unit/remote/test_http.py b/tests/unit/remote/test_http.py index ea620103b5..3a4194285e 100644 --- a/tests/unit/remote/test_http.py +++ b/tests/unit/remote/test_http.py @@ -51,9 +51,9 @@ def test_basic_auth_method(dvc): config = { "url": "http://example.com/", "path_info": "file.html", + "auth": "basic", "user": user, "password": password, - "basic_auth": True, } remote = RemoteHTTP(dvc, config) @@ -71,12 +71,30 @@ def test_digest_auth_method(dvc): config = { "url": "http://example.com/", "path_info": "file.html", + "auth": "digest", "user": user, "password": password, - "digest_auth": True, } remote = RemoteHTTP(dvc, config) assert remote.auth_method() == auth assert isinstance(remote.auth_method(), HTTPDigestAuth) + + +def test_custom_auth_method(dvc): + header = "Custom Header" + password = "password" + config = { + "url": "http://example.com/", + "path_info": "file.html", + "auth": "custom", + "custom_header": header, + "password": password, + } + + remote = RemoteHTTP(dvc, config) + + assert remote.auth_method() is None + assert header in remote.headers + assert remote.headers[header] == password From c193be3b0c9125f60f5830de816a8067eb2cf75c Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Wed, 19 Feb 2020 14:22:51 +0900 Subject: [PATCH 7/8] s/custom_header/custom_auth_header/ --- dvc/config.py | 2 +- dvc/remote/http.py | 6 +++--- tests/unit/remote/test_http.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dvc/config.py b/dvc/config.py index 0d3292f374..6a59fff1c4 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -103,7 +103,7 @@ class RelPath(str): } HTTP_COMMON = { "auth": All(Lower, Choices("basic", "digest", "custom")), - "custom_header": str, + "custom_auth_header": str, "user": str, "password": str, "ask_password": Bool, diff --git a/dvc/remote/http.py b/dvc/remote/http.py index ed480d7a83..6f4e9b61da 100644 --- a/dvc/remote/http.py +++ b/dvc/remote/http.py @@ -50,7 +50,7 @@ def __init__(self, repo, config): ) self.auth = config.get("auth", None) - self.custom_header = config.get("custom_header", None) + self.custom_auth_header = config.get("custom_auth_header", None) self.password = config.get("password", None) self.ask_password = config.get("ask_password", False) self.headers = {} @@ -133,8 +133,8 @@ def auth_method(self, path_info=None): return HTTPBasicAuth(path_info.user, self.password) elif self.auth == "digest": return HTTPDigestAuth(path_info.user, self.password) - elif self.auth == "custom" and self.custom_header: - self.headers.update({self.custom_header: self.password}) + elif self.auth == "custom" and self.custom_auth_header: + self.headers.update({self.custom_auth_header: self.password}) return None @wrap_prop(threading.Lock()) diff --git a/tests/unit/remote/test_http.py b/tests/unit/remote/test_http.py index 3a4194285e..65ad2d2e85 100644 --- a/tests/unit/remote/test_http.py +++ b/tests/unit/remote/test_http.py @@ -83,13 +83,13 @@ def test_digest_auth_method(dvc): def test_custom_auth_method(dvc): - header = "Custom Header" + header = "Custom-Header" password = "password" config = { "url": "http://example.com/", "path_info": "file.html", "auth": "custom", - "custom_header": header, + "custom_auth_header": header, "password": password, } From b893874fd26c75b630aed9a40698dd8ce02f8603 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20Rowlands=20=28=EB=B3=80=EA=B8=B0=ED=98=B8=29?= Date: Thu, 20 Feb 2020 09:06:24 +0900 Subject: [PATCH 8/8] Apply suggestions from code review Co-Authored-By: Saugat Pachhai Co-Authored-By: Ruslan Kuprieiev --- dvc/remote/http.py | 6 +++--- tests/utils/httpd.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dvc/remote/http.py b/dvc/remote/http.py index 6f4e9b61da..d0f35029bb 100644 --- a/dvc/remote/http.py +++ b/dvc/remote/http.py @@ -90,7 +90,7 @@ def chunks(): yield chunk response = self._request("POST", to_info.url, data=chunks()) - if response.status_code != 200: + if response.status_code not in (200, 201): raise HTTPError(response.status_code, response.reason) def exists(self, path_info): @@ -131,9 +131,9 @@ def auth_method(self, path_info=None): self.password = ask_password(host, user) if self.auth == "basic": return HTTPBasicAuth(path_info.user, self.password) - elif self.auth == "digest": + if self.auth == "digest": return HTTPDigestAuth(path_info.user, self.password) - elif self.auth == "custom" and self.custom_auth_header: + if self.auth == "custom" and self.custom_auth_header: self.headers.update({self.custom_auth_header: self.password}) return None diff --git a/tests/utils/httpd.py b/tests/utils/httpd.py index 7cc75f2e60..378bb75b3f 100644 --- a/tests/utils/httpd.py +++ b/tests/utils/httpd.py @@ -59,7 +59,7 @@ def do_POST(self): else: size = int(self.headers.get("Content-Length", 0)) fd.write(self.rfile.read(size)) - except (IOError, OSError) as e: + except OSError as e: self.send_error(HTTPStatus.INTERNAL_SERVER_ERROR, str(e)) self.send_response(HTTPStatus.OK) self.end_headers()