feat(rust, python): add HDFS support via hdfs-native package (#2612)
# Description

Add support for HDFS using [hdfs-native](https://github.com/Kimahriman/hdfs-native), a pure* Rust client for interacting with HDFS. Creates a new `hdfs` sub-crate, adds it as a feature to the `deltalake` meta crate, and includes it in Python wheels by default. There is a Rust integration test that requires Hadoop and Java to be installed; it makes use of a small Maven program I ship under the `integration-test` feature flag to run a MiniDFS server.

*Dynamically loads `libgssapi_krb5` using `libloading` for Kerberos support.

# Related Issue(s)

Resolves #2611
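As a usage illustration only (not part of this commit), a downstream Rust consumer might exercise the new crate roughly as follows; the namenode address and table path are placeholders:

```rust
use deltalake_hdfs::register_handlers;

#[tokio::main]
async fn main() -> deltalake_core::DeltaResult<()> {
    // Register the hdfs:// and viewfs:// object-store and log-store factories
    // provided by the new deltalake-hdfs crate.
    register_handlers(None);

    // Open an existing Delta table stored on HDFS and print its version.
    let table = deltalake_core::open_table("hdfs://namenode:9000/tmp/some-table").await?;
    println!("table version: {}", table.version());
    Ok(())
}
```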
1 parent a300218 · commit d17ed97 · 12 changed files with 223 additions and 6 deletions
@@ -0,0 +1,29 @@
[package]
name = "deltalake-hdfs"
version = "0.1.0"
authors.workspace = true
keywords.workspace = true
readme.workspace = true
edition.workspace = true
homepage.workspace = true
description.workspace = true
license.workspace = true
repository.workspace = true
rust-version.workspace = true

[dependencies]
deltalake-core = { version = ">=0.17.0, <0.19.0", path = "../core" }
hdfs-native-object-store = "0.11"

# workspace dependencies
object_store = { workspace = true }
tokio = { workspace = true }
url = { workspace = true }

[dev-dependencies]
serial_test = "3"
deltalake-test = { path = "../test" }
which = "4"

[features]
integration_test = ["hdfs-native-object-store/integration-test"]
@@ -0,0 +1,48 @@
use std::sync::Arc;

use deltalake_core::logstore::{default_logstore, logstores, LogStore, LogStoreFactory};
use deltalake_core::storage::{
    factories, url_prefix_handler, ObjectStoreFactory, ObjectStoreRef, StorageOptions,
};
use deltalake_core::{DeltaResult, Path};
use hdfs_native_object_store::HdfsObjectStore;
use url::Url;

#[derive(Clone, Default, Debug)]
pub struct HdfsFactory {}

impl ObjectStoreFactory for HdfsFactory {
    fn parse_url_opts(
        &self,
        url: &Url,
        options: &StorageOptions,
    ) -> DeltaResult<(ObjectStoreRef, Path)> {
        let store: ObjectStoreRef = Arc::new(HdfsObjectStore::with_config(
            url.as_str(),
            options.0.clone(),
        )?);
        let prefix = Path::parse(url.path())?;
        Ok((url_prefix_handler(store, prefix.clone()), prefix))
    }
}

impl LogStoreFactory for HdfsFactory {
    fn with_options(
        &self,
        store: ObjectStoreRef,
        location: &Url,
        options: &StorageOptions,
    ) -> DeltaResult<Arc<dyn LogStore>> {
        Ok(default_logstore(store, location, options))
    }
}

/// Register an [ObjectStoreFactory] for common HDFS [Url] schemes
pub fn register_handlers(_additional_prefixes: Option<Url>) {
    let factory = Arc::new(HdfsFactory {});
    for scheme in ["hdfs", "viewfs"].iter() {
        let url = Url::parse(&format!("{}://", scheme)).unwrap();
        factories().insert(url.clone(), factory.clone());
        logstores().insert(url.clone(), factory.clone());
    }
}
@@ -0,0 +1,60 @@
#![cfg(feature = "integration_test")]
use deltalake_hdfs::register_handlers;
use deltalake_test::utils::*;
use hdfs_native_object_store::minidfs::MiniDfs;
use std::{
    collections::HashSet,
    process::{Command, ExitStatus},
};

use which::which;

pub struct HdfsIntegration {
    minidfs: MiniDfs,
}

impl Default for HdfsIntegration {
    fn default() -> Self {
        register_handlers(None);
        let minidfs = MiniDfs::with_features(&HashSet::new());
        Self { minidfs }
    }
}

impl StorageIntegration for HdfsIntegration {
    fn prepare_env(&self) {
        println!("Preparing env");
    }

    fn create_bucket(&self) -> std::io::Result<ExitStatus> {
        let hadoop_exc = which("hadoop").expect("Failed to find hadoop executable");

        Ok(Command::new(hadoop_exc)
            .args(["fs", "-mkdir", &self.root_uri()])
            .status()
            .unwrap())
    }

    fn bucket_name(&self) -> String {
        "/test-deltalake".to_string()
    }

    fn root_uri(&self) -> String {
        format!("{}{}", self.minidfs.url, self.bucket_name())
    }

    fn copy_directory(&self, source: &str, destination: &str) -> std::io::Result<ExitStatus> {
        println!("Copy directory called with {} {}", source, destination);
        let hadoop_exc = which("hadoop").expect("Failed to find hadoop executable");
        Ok(Command::new(hadoop_exc)
            .args([
                "fs",
                "-copyFromLocal",
                "-p",
                source,
                &format!("{}/{}", self.root_uri(), destination),
            ])
            .status()
            .unwrap())
    }
}
@@ -0,0 +1,16 @@
#![cfg(feature = "integration_test")]
use deltalake_test::{test_read_tables, IntegrationContext, TestResult};
use serial_test::serial;

mod context;
use context::*;

#[tokio::test]
#[serial]
async fn test_read_tables_hdfs() -> TestResult {
    let context = IntegrationContext::new(Box::<HdfsIntegration>::default())?;

    test_read_tables(&context).await?;

    Ok(())
}
@@ -0,0 +1,46 @@
# HDFS Storage Backend

HDFS support is provided via the [hdfs-native-object-store](https://github.com/datafusion-contrib/hdfs-native-object-store) package, which sits on top of [hdfs-native](https://github.com/Kimahriman/hdfs-native), an HDFS client written from scratch in Rust with no bindings to libhdfs and no use of Java. While it supports most common cluster configurations, it does not support every possible client configuration.
## Supported Configurations

By default, the client looks for existing Hadoop configuration in the following order:

- If the `HADOOP_CONF_DIR` environment variable is defined, load configs from `$HADOOP_CONF_DIR/core-site.xml` and `$HADOOP_CONF_DIR/hdfs-site.xml`
- Otherwise, if the `HADOOP_HOME` environment variable is set, load configs from `$HADOOP_HOME/etc/hadoop/core-site.xml` and `$HADOOP_HOME/etc/hadoop/hdfs-site.xml`

Additionally, you can pass Hadoop configs as `storage_options`; these take precedence over the configs loaded from the files above, as the sketch below illustrates.
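For illustration only (not part of this diff), a minimal Rust sketch of passing Hadoop configs as storage options might look like the following; the cluster name, namenode addresses, and table URI are placeholders:

```rust
use std::collections::HashMap;

use deltalake_core::{DeltaResult, DeltaTableBuilder};

async fn open_with_storage_options() -> DeltaResult<()> {
    // The hdfs:// and viewfs:// handlers must be registered before opening a table.
    deltalake_hdfs::register_handlers(None);

    // Keys mirror Hadoop configuration names; all values below are placeholders.
    let storage_options: HashMap<String, String> = HashMap::from([
        ("dfs.ha.namenodes.mycluster".to_string(), "nn1,nn2".to_string()),
        (
            "dfs.namenode.rpc-address.mycluster.nn1".to_string(),
            "nn1.example.com:8020".to_string(),
        ),
        (
            "dfs.namenode.rpc-address.mycluster.nn2".to_string(),
            "nn2.example.com:8020".to_string(),
        ),
    ]);

    // Open an existing table; options passed here take precedence over file-based configs.
    let table = DeltaTableBuilder::from_uri("hdfs://mycluster/tmp/some-table")
        .with_storage_options(storage_options)
        .load()
        .await?;
    println!("table version: {}", table.version());
    Ok(())
}
```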
Currently, the supported client configuration parameters are:

- `dfs.ha.namenodes.*` - name service support
- `dfs.namenode.rpc-address.*` - name service support
- `fs.viewfs.mounttable.*.link.*` - ViewFS links
- `fs.viewfs.mounttable.*.linkFallback` - ViewFS link fallback

If you find that your setup is not supported, please file an issue in the [hdfs-native](https://github.com/Kimahriman/hdfs-native) repository.
## Secure Clusters

The client supports connecting to secure clusters through both Kerberos and token authentication, and all SASL protection types are supported. The highest supported protection mechanism advertised by the server will be used.
### Kerberos Support

Kerberos is supported by dynamically loading the `libgssapi_krb5` library. It must be installed separately through your package manager and currently only works on Linux and macOS.

Debian-based systems:
```bash
apt-get install libgssapi-krb5-2
```

RHEL-based systems:
```bash
yum install krb5-libs
```

macOS:
```bash
brew install krb5
```

Then run `kinit` to obtain your TGT, and authentication to HDFS should work automatically.
### Token Support

Token authentication is supported by looking for a token file at the path given by the `HADOOP_TOKEN_FILE_LOCATION` environment variable. This is where systems like YARN automatically place a delegation token, so token authentication works out of the box inside YARN jobs.
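A minimal sketch (assumptions: a delegation token has already been written to a local file, and the token path and table URI below are placeholders):

```rust
use deltalake_hdfs::register_handlers;

async fn open_with_token() -> deltalake_core::DeltaResult<()> {
    // Point the client at an existing delegation token file; inside a YARN
    // container this variable is already set for you.
    std::env::set_var("HADOOP_TOKEN_FILE_LOCATION", "/path/to/delegation.token");

    // Register the hdfs:// and viewfs:// handlers, then open the table as usual.
    register_handlers(None);
    let table = deltalake_core::open_table("hdfs://namenode:9000/tmp/some-table").await?;
    println!("table version: {}", table.version());
    Ok(())
}
```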
## Issues

If you run into any HDFS-specific issues, please report them to the [hdfs-native-object-store](https://github.com/datafusion-contrib/hdfs-native-object-store) repository.