Skip to content

Commit

Permalink
add support for minio, r2, gcs
Browse files Browse the repository at this point in the history
  • Loading branch information
samansmink committed Jul 12, 2024
1 parent a392ea5 commit 3e33b49
Show file tree
Hide file tree
Showing 9 changed files with 432 additions and 7 deletions.
87 changes: 87 additions & 0 deletions .github/workflows/LocalTesting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ jobs:
VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake
AZURE_STORAGE_CONNECTION_STRING: 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;'
AZURE_STORAGE_ACCOUNT: devstoreaccount1
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true

steps:
- uses: actions/checkout@v3
Expand Down Expand Up @@ -76,6 +77,92 @@ jobs:
echo "## azurite"
cat azurite_log.txt
minio-tests-linux:
  name: Minio (local S3 test server) tests (Linux)
  runs-on: ubuntu-latest
  env:
    S3_TEST_SERVER_AVAILABLE: 1
    GEN: ninja
    VCPKG_TARGET_TRIPLET: x64-linux
    VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake

  steps:
    - uses: actions/checkout@v3
      with:
        fetch-depth: 0
        submodules: 'true'

    # NOTE(review): this job declares no strategy.matrix, so matrix.duckdb_version
    # is always empty here. The extra truthiness guard makes the step skip cleanly
    # instead of running `git checkout` with an empty ref. Confirm whether a matrix
    # was intended for this job (the step appears copied from a matrixed sibling).
    - name: Checkout DuckDB to version
      if: ${{ matrix.duckdb_version && matrix.duckdb_version != '<submodule_version>' }}
      run: |
        cd duckdb
        git checkout ${{ matrix.duckdb_version }}

    - uses: actions/setup-python@v4
      with:
        python-version: '3.11'

    - name: Install Ninja
      shell: bash
      run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build

    - name: Setup Ccache
      uses: hendrikmuhs/ccache-action@main
      with:
        key: ${{ github.job }}
        save: ${{ github.ref == 'refs/heads/main' || github.repository != 'duckdb/duckdb' }}

    - name: Setup vcpkg
      uses: lukka/run-vcpkg@v11.1
      with:
        vcpkgGitCommitId: a1a1cbc975abf909a6c8985a6a2b8fe20bbd9bd6

    - name: Build
      shell: bash
      run: make

    # Brings up the minio-based S3 test server. `mkdir -p` makes the directory
    # creation idempotent so a re-run on a warm runner does not fail the step.
    - name: Start S3/HTTP test server
      shell: bash
      run: |
        cd duckdb
        mkdir -p data/attach_test
        touch data/attach_test/attach.db
        sudo ./scripts/install_s3_test_server.sh
        source ./scripts/run_s3_test_server.sh
        sleep 30

    # Warning: this overwrites the runner's ~/.aws credentials and config files.
    - name: Write AWS credentials file
      shell: bash
      run: |
        ./scripts/create_minio_credential_file.sh

    - name: Copy files to minio
      shell: bash
      env:
        DUCKDB_MINIO_TEST_SERVER_AVAILABLE: 1
        AWS_ACCESS_KEY_ID: minio_duckdb_user
        AWS_SECRET_ACCESS_KEY: minio_duckdb_user_password
        AWS_DEFAULT_REGION: eu-west-1
        AWS_ENDPOINT: duckdb-minio.com:9000
      run: |
        ./scripts/upload_test_files_to_minio.sh

    - name: Test
      shell: bash
      run: |
        make test

    # Re-runs only the minio-local suite with AWS-style env credentials set,
    # exercising the credential-provider code paths.
    - name: Run Env tests
      shell: bash
      env:
        DUCKDB_MINIO_TEST_SERVER_AVAILABLE: 1
        AWS_ACCESS_KEY_ID: minio_duckdb_user
        AWS_SECRET_ACCESS_KEY: minio_duckdb_user_password
        AWS_DEFAULT_REGION: eu-west-1
        AWS_ENDPOINT: duckdb-minio.com:9000
      run: |
        ./build/release/test/unittest "*/test/sql/cloud/minio_local/*"
generated-tests-linux:
name: Generated Tests (Linux)
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion extension-ci-tools
7 changes: 7 additions & 0 deletions extension_config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@ duckdb_extension_load(azure
GIT_TAG 49b63dc8cd166952a0a34dfd54e6cfe5b823e05e
)

# Build the aws extension to test with credential providers
# (the minio/R2/GCS delta tests read AWS-style credential files and env vars).
duckdb_extension_load(aws
LOAD_TESTS
GIT_URL https://github.com/duckdb/duckdb_aws
GIT_TAG 3d1f5c8d0127ff7aaf127935721b197e5fdd95e5
)

# Build the tpch and tpcds extension for testing/benchmarking
duckdb_extension_load(tpch)
duckdb_extension_load(tpcds)
43 changes: 43 additions & 0 deletions scripts/create_minio_credential_file.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
# Creates AWS credential and config files pointing at the local minio test server.
# Warning: overwrites your existing aws credentials file!

# Fail fast on command errors, unset variables, and pipeline failures.
set -euo pipefail

# Set the file path for the credentials file
credentials_file=~/.aws/credentials

# Set the file path for the config file
config_file=~/.aws/config

# create dir if not already exists
mkdir -p ~/.aws

# Three profiles: a valid default user, a second valid user, and a deliberately
# broken profile used to test credential-failure handling.
credentials_str="[default]
aws_access_key_id=minio_duckdb_user
aws_secret_access_key=minio_duckdb_user_password
[minio-testing-2]
aws_access_key_id=minio_duckdb_user_2
aws_secret_access_key=minio_duckdb_user_2_password
[minio-testing-invalid]
aws_access_key_id=minio_duckdb_user_invalid
aws_secret_access_key=thispasswordiscompletelywrong
aws_session_token=completelybogussessiontoken
"

# Write the credentials configuration to the file
echo "$credentials_str" > "$credentials_file"

# Create the config (per-profile regions; the invalid profile gets a bogus region)
config_str="[default]
region=eu-west-1
[profile minio-testing-2]
region=eu-west-1
[profile minio-testing-invalid]
region=the-moon-123
"

# Write the config to the file
echo "$config_str" > "$config_file"
4 changes: 4 additions & 0 deletions scripts/upload_test_files_to_minio.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Uploads the generated delta-kernel "dat" reader test files to the minio
# test server: once to the credentialed bucket, once to the public one.

# Fail fast so a failed upload fails the CI step instead of being ignored.
set -euo pipefail

endpoint_url="http://duckdb-minio.com:9000"
dat_dir="./build/release/rust/src/delta_kernel/acceptance/tests/dat/out/reader_tests/generated"

# Identical payload for both buckets; loop instead of duplicating the command.
for bucket in test-bucket test-bucket-public; do
    aws s3 cp --endpoint-url "$endpoint_url" --recursive "$dat_dir" "s3://${bucket}/dat"
done
75 changes: 70 additions & 5 deletions src/functions/delta_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,14 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p
ffi::EngineBuilder* builder;

// For "regular" paths we early out with the default builder config
if (!StringUtil::StartsWith(path, "s3://") && !StringUtil::StartsWith(path, "azure://") && !StringUtil::StartsWith(path, "az://") && !StringUtil::StartsWith(path, "abfs://") && !StringUtil::StartsWith(path, "abfss://")) {
if (!StringUtil::StartsWith(path, "s3://") &&
!StringUtil::StartsWith(path, "gcs://") &&
!StringUtil::StartsWith(path, "gs://") &&
!StringUtil::StartsWith(path, "r2://") &&
!StringUtil::StartsWith(path, "azure://") &&
!StringUtil::StartsWith(path, "az://") &&
!StringUtil::StartsWith(path, "abfs://") &&
!StringUtil::StartsWith(path, "abfss://")) {
auto interface_builder_res = ffi::get_engine_builder(KernelUtils::ToDeltaString(path), DuckDBEngineError::AllocateError);
return KernelUtils::UnpackResult(interface_builder_res, "get_engine_interface_builder for path " + path);
}
Expand All @@ -130,6 +137,33 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p
bucket = path.substr(5, end_of_container-5);
path_in_bucket = path.substr(end_of_container);
secret_type = "s3";
} else if (StringUtil::StartsWith(path, "gcs://")) {
auto end_of_container = path.find('/',6);

if(end_of_container == string::npos) {
throw IOException("Invalid gcs url passed to delta scan: %s", path);
}
bucket = path.substr(6, end_of_container-6);
path_in_bucket = path.substr(end_of_container);
secret_type = "gcs";
} else if (StringUtil::StartsWith(path, "gs://")) {
auto end_of_container = path.find('/',5);

if(end_of_container == string::npos) {
throw IOException("Invalid gcs url passed to delta scan: %s", path);
}
bucket = path.substr(5, end_of_container-5);
path_in_bucket = path.substr(end_of_container);
secret_type = "gcs";
} else if (StringUtil::StartsWith(path, "r2://")) {
auto end_of_container = path.find('/',5);

if(end_of_container == string::npos) {
throw IOException("Invalid gcs url passed to delta scan: %s", path);
}
bucket = path.substr(5, end_of_container-5);
path_in_bucket = path.substr(end_of_container);
secret_type = "r2";
} else if ((StringUtil::StartsWith(path, "azure://")) || (StringUtil::StartsWith(path, "abfss://"))) {
auto end_of_container = path.find('/',8);

Expand Down Expand Up @@ -159,8 +193,18 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p
secret_type = "azure";
}

auto interface_builder_res = ffi::get_engine_builder(KernelUtils::ToDeltaString(path), DuckDBEngineError::AllocateError);
builder = KernelUtils::UnpackResult(interface_builder_res, "get_engine_interface_builder for path " + path);
// We need to substitute DuckDB's usage of s3 and r2 paths because delta kernel needs to just interpret them as s3 protocol servers.
string cleaned_path;
if (StringUtil::StartsWith(path, "r2://") || StringUtil::StartsWith(path, "gs://") ) {
cleaned_path = "s3://" + path.substr(5);
} else if (StringUtil::StartsWith(path, "gcs://")) {
cleaned_path = "s3://" + path.substr(6);
} else {
cleaned_path = path;
}

auto interface_builder_res = ffi::get_engine_builder(KernelUtils::ToDeltaString(cleaned_path), DuckDBEngineError::AllocateError);
builder = KernelUtils::UnpackResult(interface_builder_res, "get_engine_interface_builder for path " + cleaned_path);

// For S3 or Azure paths we need to trim the url, set the container, and fetch a potential secret
auto &secret_manager = SecretManager::Get(context);
Expand All @@ -170,18 +214,24 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p

// No secret: nothing left to do here!
if (!secret_match.HasMatch()) {
if (StringUtil::StartsWith(path, "r2://") || StringUtil::StartsWith(path, "gs://") || StringUtil::StartsWith(path, "gcs://")) {
throw NotImplementedException("Can not scan a gcs:// gs:// or r2:// url without a secret providing its endpoint currently. Please create an R2 or GCS secret containing the credentials for this endpoint and try again.");
}

return builder;
}
const auto &kv_secret = dynamic_cast<const KeyValueSecret &>(*secret_match.secret_entry->secret);


// Here you would need to add the logic for setting the builder options for Azure
// This is just a placeholder and will need to be replaced with the actual logic
if (secret_type == "s3") {
if (secret_type == "s3" || secret_type == "gcs" || secret_type == "r2") {
auto key_id = kv_secret.TryGetValue("key_id").ToString();
auto secret = kv_secret.TryGetValue("secret").ToString();
auto session_token = kv_secret.TryGetValue("session_token").ToString();
auto region = kv_secret.TryGetValue("region").ToString();
auto endpoint = kv_secret.TryGetValue("endpoint").ToString();
auto use_ssl = kv_secret.TryGetValue("use_ssl").ToString();
auto url_style = kv_secret.TryGetValue("url_style").ToString();

if (key_id.empty() && secret.empty()) {
ffi::set_builder_option(builder, KernelUtils::ToDeltaString("skip_signature"), KernelUtils::ToDeltaString("true"));
Expand All @@ -196,6 +246,21 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p
if (!session_token.empty()) {
ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_session_token"), KernelUtils::ToDeltaString(session_token));
}
if (!endpoint.empty() && endpoint != "s3.amazonaws.com") {
if(!StringUtil::StartsWith(endpoint, "https://") && !StringUtil::StartsWith(endpoint, "http://")) {
if(use_ssl == "1" || use_ssl == "NULL") {
endpoint = "https://" + endpoint;
} else {
endpoint = "http://" + endpoint;
}
}

if (StringUtil::StartsWith(endpoint, "http://")) {
ffi::set_builder_option(builder, KernelUtils::ToDeltaString("allow_http"), KernelUtils::ToDeltaString("true"));
}
ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_endpoint"), KernelUtils::ToDeltaString(endpoint));
}

ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_region"), KernelUtils::ToDeltaString(region));

} else if (secret_type == "azure") {
Expand Down
93 changes: 93 additions & 0 deletions test/sql/cloud/minio_local/gcs_r2.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# name: test/sql/cloud/minio_local/gcs_r2.test
# description: test delta extension with GCS and R2
# group: [aws]
# NOTE(review): the group tag says [aws] but this file lives under
# test/sql/cloud/minio_local — confirm which group was intended.

require httpfs

require parquet

require delta

require aws

# Only runs when the local minio S3 test server and AWS-style env are available.
require-env DUCKDB_MINIO_TEST_SERVER_AVAILABLE

require-env AWS_ACCESS_KEY_ID

require-env AWS_SECRET_ACCESS_KEY

require-env AWS_DEFAULT_REGION

require-env AWS_ENDPOINT

statement ok
set secret_directory='__TEST_DIR__/minio_local_gcs_env'

# Without a secret, gcs://, gs:// and r2:// delta scans must fail with a clear error.
statement error
FROM delta_scan('gcs://test-bucket/dat/all_primitive_types/delta')
----
Can not scan a gcs:// gs:// or r2:// url without a secret providing its endpoint currently. Please create an R2 or GCS secret containing the credentials for this endpoint and try again.

statement error
FROM delta_scan('gs://test-bucket/dat/all_primitive_types/delta')
----
Can not scan a gcs:// gs:// or r2:// url without a secret providing its endpoint currently. Please create an R2 or GCS secret containing the credentials for this endpoint and try again.

statement error
FROM delta_scan('r2://test-bucket/dat/all_primitive_types/delta')
----
Can not scan a gcs:// gs:// or r2:// url without a secret providing its endpoint currently. Please create an R2 or GCS secret containing the credentials for this endpoint and try again.

# create a fake gcs secret
# (points at the local minio endpoint; USE_SSL false because minio serves http)
statement ok
CREATE SECRET (
TYPE GCS,
KEY_ID '${AWS_ACCESS_KEY_ID}',
SECRET '${AWS_SECRET_ACCESS_KEY}',
REGION '${AWS_DEFAULT_REGION}',
ENDPOINT '${AWS_ENDPOINT}',
USE_SSL false
)

# Both gcs:// and the gs:// alias should now resolve through the secret.
query I
SELECT int32
FROM delta_scan('gcs://test-bucket-public/dat/all_primitive_types/delta')
----
0
1
2
3
4

query I
SELECT int32
FROM delta_scan('gs://test-bucket-public/dat/all_primitive_types/delta')
----
0
1
2
3
4

# create a fake r2 secret
statement ok
CREATE SECRET s1 (
TYPE R2,
PROVIDER config,
account_id 'some_bogus_account',
KEY_ID '${AWS_ACCESS_KEY_ID}',
SECRET '${AWS_SECRET_ACCESS_KEY}',
REGION '${AWS_DEFAULT_REGION}',
ENDPOINT '${AWS_ENDPOINT}',
USE_SSL false
)

query I
SELECT int32
FROM delta_scan('r2://test-bucket-public/dat/all_primitive_types/delta')
----
0
1
2
3
4
Loading

0 comments on commit 3e33b49

Please sign in to comment.