-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
webhdfs: expose kerberos and https options #6936
Changes from 2 commits
99a9770
60315d8
50f639c
2883507
5d50107
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -179,10 +179,12 @@ class RelPath(str): | |
}, | ||
"hdfs": {"user": str, "kerb_ticket": str, **REMOTE_COMMON}, | ||
"webhdfs": { | ||
"hdfscli_config": str, | ||
"webhdfs_token": str, | ||
"user": str, | ||
"webhdfs_alias": str, | ||
"kerberos": Bool, | ||
"kerberos_principal": str, | ||
"proxy_to": str, | ||
"ssl_verify": Any(Bool, str), | ||
"token": str, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this makes sense, with the previous caveat about prefixes. |
||
"use_https": Bool, | ||
Comment on lines
+185
to
+187
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If my understanding of the Hadoop docs is correct, then The question is if we want to create a new As Regarding the hadoop prefix see other comments. |
||
**REMOTE_COMMON, | ||
}, | ||
"azure": { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,54 @@ | ||
from unittest.mock import Mock, create_autospec | ||
|
||
import pytest | ||
import requests | ||
|
||
from dvc.fs.webhdfs import WebHDFSFileSystem | ||
|
||
host = "host" | ||
kerberos = False | ||
kerberos_principal = "principal" | ||
port = 12345 | ||
proxy_to = "proxy" | ||
ssl_verify = False | ||
token = "token" | ||
use_https = True | ||
user = "test" | ||
webhdfs_token = "token" | ||
webhdfs_alias = "alias-name" | ||
hdfscli_config = "path/to/cli/config" | ||
|
||
|
||
def test_init(dvc): | ||
url = "webhdfs://test@127.0.0.1:50070" | ||
config = { | ||
"host": url, | ||
"webhdfs_token": webhdfs_token, | ||
"webhdfs_alias": webhdfs_alias, | ||
"hdfscli_config": hdfscli_config, | ||
"user": user, | ||
|
||
|
||
@pytest.fixture() | ||
def webhdfs_config(): | ||
url = f"webhdfs://{user}@{host}:{port}" | ||
url_config = WebHDFSFileSystem._get_kwargs_from_urls(url) | ||
return { | ||
"kerberos": kerberos, | ||
"kerberos_principal": kerberos_principal, | ||
"proxy_to": proxy_to, | ||
"ssl_verify": ssl_verify, | ||
"token": token, | ||
"use_https": use_https, | ||
**url_config, | ||
} | ||
|
||
fs = WebHDFSFileSystem(**config) | ||
assert fs.fs_args["token"] == webhdfs_token | ||
|
||
def test_init(dvc, webhdfs_config): | ||
fs = WebHDFSFileSystem(**webhdfs_config) | ||
assert fs.fs_args["host"] == host | ||
assert fs.fs_args["token"] == token | ||
assert fs.fs_args["user"] == user | ||
assert fs.fs_args["port"] == port | ||
assert fs.fs_args["kerberos"] == kerberos | ||
assert fs.fs_args["kerb_kwargs"] == {"principal": kerberos_principal} | ||
assert fs.fs_args["proxy_to"] == proxy_to | ||
assert fs.fs_args["use_https"] == use_https | ||
|
||
|
||
def test_verify_ssl(dvc, webhdfs_config, monkeypatch): | ||
mock_session = create_autospec(requests.Session) | ||
monkeypatch.setattr(requests, "Session", Mock(return_value=mock_session)) | ||
# can't have token at the same time as user or proxy_to | ||
del webhdfs_config["token"] | ||
fs = WebHDFSFileSystem(**webhdfs_config) | ||
# ssl verify can't be set until after the file system is instantiated | ||
fs.fs # pylint: disable=pointless-statement | ||
assert mock_session.verify == ssl_verify |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should it be
proxy_user
orsuperuser
per https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/Superusers.html ? Orhadoop_proxy_user
to be precise."proxy to" could refer to many things...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes for this initial pull request I just copied the names of parameters from
fsspec
and left them as a first draft to have a discussion around.I think
proxy_user
would be clearer, and in the hadoop docs I feel like I see proxy user more often than superuser.If we do something like
hadoop_proxy_user
I feel like maybe we should prefix all the other options withhadoop
orwebhdfs
as well to be consistent?It seems like the other DVC protocol config options do not do this kind of prefixing except google drive, but I can see how it would make sense.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks @gudmundur-heimisson . It's an interesting question and I'm sure @efiop will know best what to do. I personally do like the prefixing idea