webhdfs: expose kerberos and https options #6936
Config schema changes:

```diff
@@ -179,10 +179,12 @@ class RelPath(str):
     },
     "hdfs": {"user": str, "kerb_ticket": str, **REMOTE_COMMON},
     "webhdfs": {
-        "hdfscli_config": str,
-        "webhdfs_token": str,
-        "user": str,
-        "webhdfs_alias": str,
+        "kerberos": Bool,
+        "kerberos_principal": str,
+        "proxy_to": str,
+        "ssl_verify": Any(Bool, str),
+        "token": str,
+        "use_https": Bool,
         **REMOTE_COMMON,
     },
     "azure": {
```

Author
I think this makes sense, with the previous caveat about prefixes.

Comment on lines +185 to +187 (`"ssl_verify"`, `"token"`, `"use_https"`):

Author
If my understanding of the Hadoop docs is correct, then […]. The question is if we want to create a new […]. As […]. Regarding the hadoop prefix, see the other comments.
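For orientation, here is a small, self-contained sketch (not code from this PR) of the kind of remote configuration the extended `webhdfs` entry is meant to accept. DVC's config schema is voluptuous-based; the standalone `WEBHDFS_SCHEMA` below mimics only the new keys, the assumption that DVC's `Bool` behaves like voluptuous' `Boolean` is mine, and all example values are made up.

```python
# Illustrative only -- not part of this PR. A standalone voluptuous schema
# that mirrors the new "webhdfs" options shown in the diff above.
from voluptuous import Any, Boolean, Schema

Bool = Boolean()  # assumption: DVC's `Bool` validator behaves like this

WEBHDFS_SCHEMA = Schema(
    {
        "kerberos": Bool,
        "kerberos_principal": str,
        "proxy_to": str,
        "ssl_verify": Any(Bool, str),  # a flag, or a path to a CA bundle
        "token": str,
        "use_https": Bool,
    }
)

# A delegation-token setup over HTTPS (hypothetical values):
WEBHDFS_SCHEMA({"token": "my-delegation-token", "use_https": True})

# A Kerberos setup that skips certificate verification (hypothetical values):
WEBHDFS_SCHEMA(
    {"kerberos": True, "kerberos_principal": "user@REALM", "ssl_verify": False}
)
```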
Unit test changes:

```diff
@@ -1,21 +1,54 @@
+from unittest.mock import Mock, create_autospec
+
+import pytest
+import requests
+
 from dvc.fs.webhdfs import WebHDFSFileSystem

+host = "host"
+kerberos = False
+kerberos_principal = "principal"
+port = 12345
+proxy_to = "proxy"
+ssl_verify = False
+token = "token"
+use_https = True
 user = "test"
-webhdfs_token = "token"
-webhdfs_alias = "alias-name"
-hdfscli_config = "path/to/cli/config"


-def test_init(dvc):
-    url = "webhdfs://test@127.0.0.1:50070"
-    config = {
-        "host": url,
-        "webhdfs_token": webhdfs_token,
-        "webhdfs_alias": webhdfs_alias,
-        "hdfscli_config": hdfscli_config,
-        "user": user,
-    }
-    fs = WebHDFSFileSystem(**config)
-    assert fs.fs_args["token"] == webhdfs_token
+@pytest.fixture()
+def webhdfs_config():
+    url = f"webhdfs://{user}@{host}:{port}"
+    url_config = WebHDFSFileSystem._get_kwargs_from_urls(url)
+    return {
+        "kerberos": kerberos,
+        "kerberos_principal": kerberos_principal,
+        "proxy_to": proxy_to,
+        "ssl_verify": ssl_verify,
+        "token": token,
+        "use_https": use_https,
+        **url_config,
+    }
+
+
+def test_init(dvc, webhdfs_config):
+    fs = WebHDFSFileSystem(**webhdfs_config)
+    assert fs.fs_args["host"] == host
+    assert fs.fs_args["token"] == token
+    assert fs.fs_args["user"] == user
+    assert fs.fs_args["port"] == port
+    assert fs.fs_args["kerberos"] == kerberos
+    assert fs.fs_args["kerb_kwargs"] == {"principal": kerberos_principal}
+    assert fs.fs_args["proxy_to"] == proxy_to
+    assert fs.fs_args["use_https"] == use_https
+
+
+def test_verify_ssl(dvc, webhdfs_config, monkeypatch):
+    mock_session = create_autospec(requests.Session)
+    monkeypatch.setattr(requests, "Session", Mock(return_value=mock_session))
+    # can't have token at the same time as user or proxy_to
+    del webhdfs_config["token"]
+    fs = WebHDFSFileSystem(**webhdfs_config)
+    # ssl verify can't be set until after the file system is instantiated
+    fs.fs  # pylint: disable=pointless-statement
+    assert mock_session.verify == ssl_verify
```
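The assertions in `test_init` pin down how the user-facing config keys end up as fsspec WebHDFS keyword arguments, with `kerberos_principal` folded into a nested `kerb_kwargs` dict. The sketch below is only an illustration of that mapping, not the actual implementation in `WebHDFSFileSystem`; the helper name `prepare_webhdfs_args` is made up.

```python
# Hypothetical illustration of the config -> fsspec kwarg mapping that the
# assertions in test_init exercise; the real DVC code may differ.
from typing import Any, Dict


def prepare_webhdfs_args(config: Dict[str, Any]) -> Dict[str, Any]:
    config = dict(config)  # don't mutate the caller's dict
    principal = config.pop("kerberos_principal", None)
    return {
        "host": config.get("host"),
        "port": config.get("port"),
        "user": config.get("user"),
        "token": config.get("token"),
        "kerberos": config.get("kerberos", False),
        # extra Kerberos settings travel in a nested dict
        "kerb_kwargs": {"principal": principal} if principal else {},
        "proxy_to": config.get("proxy_to"),
        "use_https": config.get("use_https", False),
    }


# With the module-level values from the test above:
args = prepare_webhdfs_args(
    {
        "host": "host",
        "port": 12345,
        "user": "test",
        "token": "token",
        "kerberos": False,
        "kerberos_principal": "principal",
        "proxy_to": "proxy",
        "use_https": True,
    }
)
assert args["kerb_kwargs"] == {"principal": "principal"}
```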
Contributor
Should it be `proxy_user` or `superuser`, per https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/Superusers.html? Or `hadoop_proxy_user` to be precise. "proxy to" could refer to many things...
Author
Yes, for this initial pull request I just copied the parameter names from `fsspec` and left them as a first draft to have a discussion around. I think `proxy_user` would be clearer, and in the Hadoop docs I feel like I see "proxy user" more often than "superuser". If we do something like `hadoop_proxy_user`, I feel like maybe we should prefix all the other options with `hadoop` or `webhdfs` as well, to be consistent? It seems like the other DVC protocol config options do not do this kind of prefixing, except Google Drive, but I can see how it would make sense.
Contributor
Thanks @gudmundur-heimisson. It's an interesting question and I'm sure @efiop will know best what to do. I personally do like the prefixing idea.
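For readers unfamiliar with the Hadoop proxy-user mechanism under discussion: at the WebHDFS REST level, impersonation is expressed by an authenticated superuser adding a `doas` query parameter to its requests (see the Superusers page linked above). A rough, hypothetical illustration follows; the host, path, usernames, and certificate bundle are made up, and this is not DVC code, only what an option like `proxy_to` (and `ssl_verify`) would ultimately influence.

```python
# Rough illustration of WebHDFS proxy-user impersonation (not DVC code).
# The superuser authenticates as itself and asks to act "as" another user
# via the `doas` query parameter; `verify` is what an ssl_verify-style
# option would control on the HTTPS session.
import requests

response = requests.get(
    "https://namenode.example.com:9871/webhdfs/v1/data/file.csv",  # hypothetical
    params={"op": "GETFILESTATUS", "user.name": "superuser", "doas": "enduser"},
    verify="/etc/ssl/certs/ca-bundle.crt",  # or False to skip verification
)
response.raise_for_status()
print(response.json())
```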