diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml
index c702df258f8..fcebe39571b 100644
--- a/.github/workflows/java.yml
+++ b/.github/workflows/java.yml
@@ -47,7 +47,7 @@ jobs:
         run: cargo clippy --all-targets -- -D warnings
 
   build-and-test-java:
-    runs-on: ubuntu-24.04
+    runs-on: warp-ubuntu-latest-x64-4x
     timeout-minutes: 60
     strategy:
       matrix:
diff --git a/.github/workflows/run_tests/action.yml b/.github/workflows/run_tests/action.yml
index 14c4b3d6f46..1800ce614fe 100644
--- a/.github/workflows/run_tests/action.yml
+++ b/.github/workflows/run_tests/action.yml
@@ -12,6 +12,9 @@ inputs:
 runs:
   using: "composite"
   steps:
+    - name: Setup MSVC for torch.compile
+      if: runner.os == 'Windows'
+      uses: ilammy/msvc-dev-cmd@v1
     - name: Install dependencies
       working-directory: python
      shell: bash
diff --git a/Cargo.lock b/Cargo.lock
index e82504e3f32..f7263fb0748 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,12 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "RustyXML"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5"
+
 [[package]]
 name = "addr2line"
 version = "0.25.1"
@@ -430,6 +436,17 @@ dependencies = [
  "serde_json",
 ]
 
+[[package]]
+name = "async-channel"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
+dependencies = [
+ "concurrent-queue",
+ "event-listener 2.5.3",
+ "futures-core",
+]
+
 [[package]]
 name = "async-channel"
 version = "2.5.0"
@@ -459,17 +476,53 @@ dependencies = [
  "zstd-safe",
 ]
 
+[[package]]
+name = "async-io"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "456b8a8feb6f42d237746d4b3e9a178494627745c3c56c6ea55d92ba50d026fc"
+dependencies = [
+ "autocfg",
+ "cfg-if",
+ "concurrent-queue",
+ "futures-io",
+ "futures-lite 2.6.1",
+ "parking",
+ "polling",
+ "rustix 1.1.3",
+ "slab",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "async-lock"
 version = "3.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "290f7f2596bd5b78a9fec8088ccd89180d7f9f55b94b0576823bbbdc72ee8311"
 dependencies = [
- "event-listener",
+ "event-listener 5.4.1",
  "event-listener-strategy",
  "pin-project-lite",
 ]
 
+[[package]]
+name = "async-process"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc50921ec0055cdd8a16de48773bfeec5c972598674347252c0399676be7da75"
+dependencies = [
+ "async-channel 2.5.0",
+ "async-io",
+ "async-lock",
+ "async-signal",
+ "async-task",
+ "blocking",
+ "cfg-if",
+ "event-listener 5.4.1",
+ "futures-lite 2.6.1",
+ "rustix 1.1.3",
+]
+
 [[package]]
 name = "async-recursion"
 version = "1.1.1"
@@ -481,6 +534,30 @@ dependencies = [
  "syn 2.0.114",
 ]
 
+[[package]]
+name = "async-signal"
+version = "0.2.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43c070bbf59cd3570b6b2dd54cd772527c7c3620fce8be898406dd3ed6adc64c"
+dependencies = [
+ "async-io",
+ "async-lock",
+ "atomic-waker",
+ "cfg-if",
+ "futures-core",
+ "futures-io",
+ "rustix 1.1.3",
+ "signal-hook-registry",
+ "slab",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "async-task"
+version = "4.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de"
+
 [[package]]
 name = "async-trait"
 version = "0.1.89"
@@ -541,7 +618,7 @@ dependencies = [
  "aws-smithy-types",
  "aws-types",
  "bytes",
- "fastrand",
+ "fastrand 2.3.0",
  "hex",
  "http 1.4.0",
  "ring",
@@ -566,9 +643,9 @@ dependencies = [
 
 [[package]]
 name = "aws-lc-rs"
-version = "1.15.2"
+version = "1.15.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a88aab2464f1f25453baa7a07c84c5b7684e274054ba06817f382357f77a288"
+checksum = "e84ce723ab67259cfeb9877c6a639ee9eb7a27b28123abd71db7f0d5d0cc9d86"
 dependencies = [
  "aws-lc-sys",
  "zeroize",
@@ -576,9 +653,9 @@ dependencies = [
 
 [[package]]
 name = "aws-lc-sys"
-version = "0.35.0"
+version = "0.36.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b45afffdee1e7c9126814751f88dddc747f41d91da16c9551a0f1e8a11e788a1"
+checksum = "43a442ece363113bd4bd4c8b18977a7798dd4d3c3383f34fb61936960e8f4ad8"
 dependencies = [
  "cc",
  "cmake",
@@ -588,9 +665,9 @@ dependencies = [
 
 [[package]]
 name = "aws-runtime"
-version = "1.5.17"
+version = "1.5.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d81b5b2898f6798ad58f484856768bca817e3cd9de0974c24ae0f1113fe88f1b"
+checksum = "959dab27ce613e6c9658eb3621064d0e2027e5f2acb65bc526a43577facea557"
 dependencies = [
  "aws-credential-types",
  "aws-sigv4",
@@ -602,7 +679,7 @@ dependencies = [
  "aws-smithy-types",
  "aws-types",
  "bytes",
- "fastrand",
+ "fastrand 2.3.0",
  "http 0.2.12",
  "http-body 0.4.6",
  "percent-encoding",
@@ -613,21 +690,22 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-dynamodb"
-version = "1.101.0"
+version = "1.102.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6f98cd9e5f2fc790aff1f393bc3c8680deea31c05d3c6f23b625cdc50b1b6b4"
+checksum = "f5f7e6a53cf5ee8b7041c73106d9a93480b47f8b955466262b043aab0b5bf489"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
  "aws-smithy-json",
+ "aws-smithy-observability",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
  "aws-types",
  "bytes",
- "fastrand",
+ "fastrand 2.3.0",
  "http 0.2.12",
  "regex-lite",
  "tracing",
@@ -635,9 +713,9 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-s3"
-version = "1.119.0"
+version = "1.120.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d65fddc3844f902dfe1864acb8494db5f9342015ee3ab7890270d36fbd2e01c"
+checksum = "06673901e961f20fa8d7da907da48f7ad6c1b383e3726c22bd418900f015abe1"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -647,19 +725,20 @@ dependencies = [
  "aws-smithy-eventstream",
  "aws-smithy-http",
  "aws-smithy-json",
+ "aws-smithy-observability",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
  "aws-smithy-xml",
  "aws-types",
  "bytes",
- "fastrand",
+ "fastrand 2.3.0",
  "hex",
  "hmac",
  "http 0.2.12",
  "http 1.4.0",
  "http-body 0.4.6",
- "lru",
+ "lru 0.16.3",
  "percent-encoding",
  "regex-lite",
  "sha2",
@@ -669,21 +748,22 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-sso"
-version = "1.91.0"
+version = "1.92.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8ee6402a36f27b52fe67661c6732d684b2635152b676aa2babbfb5204f99115d"
+checksum = "b7d63bd2bdeeb49aa3f9b00c15e18583503b778b2e792fc06284d54e7d5b6566"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
  "aws-smithy-json",
+ "aws-smithy-observability",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
  "aws-types",
  "bytes",
- "fastrand",
+ "fastrand 2.3.0",
  "http 0.2.12",
  "regex-lite",
  "tracing",
@@ -691,21 +771,22 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-ssooidc"
-version = "1.93.0"
+version = "1.94.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a45a7f750bbd170ee3677671ad782d90b894548f4e4ae168302c57ec9de5cb3e"
+checksum = "532d93574bf731f311bafb761366f9ece345a0416dbcc273d81d6d1a1205239b"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
  "aws-smithy-json",
+ "aws-smithy-observability",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
  "aws-types",
  "bytes",
- "fastrand",
+ "fastrand 2.3.0",
  "http 0.2.12",
  "regex-lite",
  "tracing",
@@ -713,22 +794,23 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-sts"
-version = "1.95.0"
+version = "1.96.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "55542378e419558e6b1f398ca70adb0b2088077e79ad9f14eb09441f2f7b2164"
+checksum = "357e9a029c7524db6a0099cd77fbd5da165540339e7296cca603531bc783b56c"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
  "aws-smithy-json",
+ "aws-smithy-observability",
  "aws-smithy-query",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
  "aws-smithy-xml",
  "aws-types",
- "fastrand",
+ "fastrand 2.3.0",
  "http 0.2.12",
  "regex-lite",
  "tracing",
@@ -775,9 +857,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-checksums"
-version = "0.63.12"
+version = "0.63.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "87294a084b43d649d967efe58aa1f9e0adc260e13a6938eb904c0ae9b45824ae"
+checksum = "23374b9170cbbcc6f5df8dc5ebb9b6c5c28a3c8f599f0e8b8b10eb6f4a5c6e74"
 dependencies = [
  "aws-smithy-http",
  "aws-smithy-types",
@@ -867,9 +949,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-observability"
-version = "0.1.5"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "17f616c3f2260612fe44cede278bafa18e73e6479c4e393e2c4518cf2a9a228a"
+checksum = "ef1fcbefc7ece1d70dcce29e490f269695dfca2d2bacdeaf9e5c3f799e4e6a42"
 dependencies = [
  "aws-smithy-runtime-api",
 ]
@@ -886,9 +968,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-runtime"
-version = "1.9.5"
+version = "1.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a392db6c583ea4a912538afb86b7be7c5d8887d91604f50eb55c262ee1b4a5f5"
+checksum = "bb5b6167fcdf47399024e81ac08e795180c576a20e4d4ce67949f9a88ae37dc1"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-http",
@@ -897,7 +979,7 @@ dependencies = [
  "aws-smithy-runtime-api",
  "aws-smithy-types",
  "bytes",
- "fastrand",
+ "fastrand 2.3.0",
  "http 0.2.12",
  "http 1.4.0",
  "http-body 0.4.6",
@@ -910,9 +992,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-runtime-api"
-version = "1.9.3"
+version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab0d43d899f9e508300e587bf582ba54c27a452dd0a9ea294690669138ae14a2"
+checksum = "efce7aaaf59ad53c5412f14fc19b2d5c6ab2c3ec688d272fd31f76ec12f44fb0"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-types",
@@ -927,9 +1009,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-types"
-version = "1.3.5"
+version = "1.3.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "905cb13a9895626d49cf2ced759b062d913834c7482c38e49557eac4e6193f01"
+checksum = "65f172bcb02424eb94425db8aed1b6d583b5104d4d5ddddf22402c661a320048"
 dependencies = [
  "base64-simd",
  "bytes",
@@ -1029,13 +1111,120 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "azure_core"
+version = "0.21.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b552ad43a45a746461ec3d3a51dfb6466b4759209414b439c165eb6a6b7729e"
+dependencies = [
+ "async-trait",
+ "base64 0.22.1",
+ "bytes",
+ "dyn-clone",
+ "futures",
+ "getrandom 0.2.17",
+ "hmac",
+ "http-types",
+ "once_cell",
+ "paste",
+ "pin-project",
+ "quick-xml 0.31.0",
+ "rand 0.8.5",
+ "reqwest",
+ "rustc_version",
+ "serde",
+ "serde_json",
+ "sha2",
+ "time",
+ "tracing",
+ "url",
+ "uuid",
+]
+
+[[package]]
+name = "azure_identity"
+version = "0.21.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88ddd80344317c40c04b603807b63a5cefa532f1b43522e72f480a988141f744"
+dependencies = [
+ "async-lock",
+ "async-process",
+ "async-trait",
+ "azure_core",
+ "futures",
+ "oauth2",
+ "pin-project",
+ "serde",
+ "time",
+ "tracing",
+ "tz-rs",
+ "url",
+ "uuid",
+]
+
+[[package]]
+name = "azure_storage"
+version = "0.21.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59f838159f4d29cb400a14d9d757578ba495ae64feb07a7516bf9e4415127126"
+dependencies = [
+ "RustyXML",
+ "async-lock",
+ "async-trait",
+ "azure_core",
+ "bytes",
+ "serde",
+ "serde_derive",
+ "time",
+ "tracing",
+ "url",
+ "uuid",
+]
+
+[[package]]
+name = "azure_storage_blobs"
+version = "0.21.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97e83c3636ae86d9a6a7962b2112e3b19eb3903915c50ce06ff54ff0a2e6a7e4"
+dependencies = [
+ "RustyXML",
+ "azure_core",
+ "azure_storage",
+ "azure_svc_blobstorage",
+ "bytes",
+ "futures",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "time",
+ "tracing",
+ "url",
+ "uuid",
+]
+
+[[package]]
+name = "azure_svc_blobstorage"
+version = "0.21.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e6c6f20c5611b885ba94c7bae5e02849a267381aecb8aee577e8c35ff4064c6"
+dependencies = [
+ "azure_core",
+ "bytes",
+ "futures",
+ "log",
+ "once_cell",
+ "serde",
+ "serde_json",
+ "time",
+]
+
 [[package]]
 name = "backon"
 version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cffb0e931875b666fc4fcb20fee52e9bbd1ef836fd9e9e04ec21555f9f85f7ef"
 dependencies = [
- "fastrand",
+ "fastrand 2.3.0",
  "gloo-timers",
  "tokio",
 ]
@@ -1217,6 +1406,19 @@ dependencies = [
  "generic-array",
 ]
 
+[[package]]
+name = "blocking"
+version = "1.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e83f8d02be6967315521be875afa792a316e28d57b5a2d401897e2a7921b7f21"
+dependencies = [
+ "async-channel 2.5.0",
+ "async-task",
+ "futures-io",
+ "futures-lite 2.6.1",
+ "piper",
+]
+
 [[package]]
 name = "bon"
 version = "3.8.2"
@@ -1342,9 +1544,9 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.2.52"
+version = "1.2.53"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd4932aefd12402b36c60956a4fe0035421f544799057659ff86f923657aada3"
+checksum = "755d2fce177175ffca841e9a06afdb2c4ab0f593d53b4dee48147dfaade85932"
 dependencies = [
  "find-msvc-tools",
  "jobserver",
@@ -1381,9 +1583,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
 
 [[package]]
 name = "chrono"
-version = "0.4.42"
+version = "0.4.43"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2"
+checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118"
 dependencies = [
  "iana-time-zone",
  "js-sys",
@@ -1554,6 +1756,12 @@ dependencies = [
  "tiny-keccak",
 ]
 
+[[package]]
+name = "const_fn" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f8a2ca5ac02d09563609681103aada9e1777d54fc57a5acd7a41404f9c93b6e" + [[package]] name = "constant_time_eq" version = "0.4.2" @@ -1615,9 +1823,9 @@ dependencies = [ [[package]] name = "crc" -version = "3.4.0" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" +checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675" dependencies = [ "crc-catalog", ] @@ -1630,15 +1838,14 @@ checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" [[package]] name = "crc-fast" -version = "1.6.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ddc2d09feefeee8bd78101665bd8645637828fa9317f9f292496dbbd8c65ff3" +checksum = "2fd92aca2c6001b1bf5ba0ff84ee74ec8501b52bbef0cac80bf25a6c1d87a83d" dependencies = [ "crc", "digest", - "rand 0.9.2", - "regex", "rustversion", + "spin 0.10.0", ] [[package]] @@ -3040,6 +3247,12 @@ version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca81e6b4777c89fd810c25a4be2b1bd93ea034fbe58e6a75216a34c6b82c539b" +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + [[package]] name = "event-listener" version = "5.4.1" @@ -3057,7 +3270,7 @@ version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" dependencies = [ - "event-listener", + "event-listener 5.4.1", "pin-project-lite", ] @@ -3073,6 +3286,15 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9afc2bd4d5a73106dd53d10d73d3401c2f32730ba2c0b93ddb888a8983680471" +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -3091,21 +3313,20 @@ dependencies = [ [[package]] name = "filetime" -version = "0.2.26" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc0505cd1b6fa6580283f6bdf70a73fcf4aba1184038c90902b92b3dd0df63ed" +checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" dependencies = [ "cfg-if", "libc", "libredox", - "windows-sys 0.60.2", ] [[package]] name = "find-msvc-tools" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f449e6c6c08c865631d4890cfacf252b3d396c9bcc83adb6623cdb02a8336c41" +checksum = "8591b0bcc8a98a64310a2fae1bb3e9b8564dd10e381e6e28010fde8e8e8568db" [[package]] name = "findshlibs" @@ -3291,6 +3512,34 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +[[package]] +name = "futures-lite" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand 1.9.0", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + +[[package]] +name 
= "futures-lite" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f78e10609fe0e0b3f4157ffab1876319b5b0db102a2c60dc4626306dc46b44ad" +dependencies = [ + "fastrand 2.3.0", + "futures-core", + "futures-io", + "parking", + "pin-project-lite", +] + [[package]] name = "futures-macro" version = "0.3.31" @@ -3485,6 +3734,17 @@ dependencies = [ "libm", ] +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -3494,7 +3754,7 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -3536,6 +3796,26 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "google-cloud-auth" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5572275b7f06b6fde8eec61a23d87c83aae362bee586bbeb8773b3f98658ae81" +dependencies = [ + "async-trait", + "base64 0.22.1", + "derive_builder 0.20.2", + "http 1.4.0", + "reqwest", + "rustls 0.23.36", + "rustls-pemfile", + "serde", + "serde_json", + "thiserror 2.0.18", + "time", + "tokio", +] + [[package]] name = "group" version = "0.12.1" @@ -3684,7 +3964,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "ureq", "windows-sys 0.60.2", @@ -3769,6 +4049,26 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-types" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e9b187a72d63adbfba487f48095306ac823049cb504ee195541e91c7775f5ad" +dependencies = [ + "anyhow", + "async-channel 1.9.0", + "base64 0.13.1", + "futures-lite 1.13.0", + "infer", + "pin-project-lite", + "rand 0.7.3", + "serde", + "serde_json", + "serde_qs", + "serde_urlencoded", + "url", +] + [[package]] name = "httparse" version = "1.10.1" @@ -4153,6 +4453,12 @@ dependencies = [ "web-time", ] +[[package]] +name = "infer" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac" + [[package]] name = "inferno" version = "0.11.21" @@ -4181,6 +4487,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", +] + [[package]] name = "integer-encoding" version = "3.0.4" @@ -4347,9 +4662,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.83" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "464a3709c7f55f1f721e5389aa6ea4e3bc6aba669353300af094b29ffbdde1d8" +checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" dependencies = [ "once_cell", "wasm-bindgen", @@ -4741,7 +5056,7 @@ dependencies = [ "arrow-ord", "arrow-schema", "arrow-select", - "async-channel", + "async-channel 2.5.0", "async-recursion", "async-trait", "bitpacking", @@ -4892,9 +5207,19 @@ dependencies = [ "arrow-ipc", "arrow-schema", "async-trait", + "aws-config", + "aws-credential-types", + "aws-sdk-sts", "axum", + "azure_core", + "azure_identity", + "azure_storage", + "azure_storage_blobs", + "base64 0.22.1", "bytes", + 
"chrono", "futures", + "google-cloud-auth", "lance", "lance-core", "lance-index", @@ -4907,8 +5232,10 @@ dependencies = [ "rstest", "serde", "serde_json", + "sha2", "snafu", "tempfile", + "time", "tokio", "tower", "tower-http 0.5.2", @@ -4918,9 +5245,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.0.18" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ea349999bcda4eea53fc05d334b3775ec314761e6a706555c777d7a29b18d19" +checksum = "a2acdba67f84190067532fce07b51a435dd390d7cdc1129a05003e5cb3274cf0" dependencies = [ "reqwest", "serde", @@ -5221,7 +5548,7 @@ dependencies = [ "reqwest", "serde", "tar", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "yada", ] @@ -5344,6 +5671,15 @@ dependencies = [ "hashbrown 0.15.5", ] +[[package]] +name = "lru" +version = "0.16.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" +dependencies = [ + "hashbrown 0.16.1", +] + [[package]] name = "lru-slab" version = "0.1.2" @@ -5512,7 +5848,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" dependencies = [ "libc", - "wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "windows-sys 0.61.2", ] @@ -5559,7 +5895,7 @@ dependencies = [ "crossbeam-epoch", "crossbeam-utils", "equivalent", - "event-listener", + "event-listener 5.4.1", "futures-util", "parking_lot", "portable-atomic", @@ -5811,12 +6147,40 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "num_threads" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" +dependencies = [ + "libc", +] + [[package]] name = "number_prefix" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" +[[package]] +name = "oauth2" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c38841cdd844847e3e7c8d29cef9dcfed8877f8f56f9071f77843ecf3baf937f" +dependencies = [ + "base64 0.13.1", + "chrono", + "getrandom 0.2.17", + "http 0.2.12", + "rand 0.8.5", + "serde", + "serde_json", + "serde_path_to_error", + "sha2", + "thiserror 1.0.69", + "url", +] + [[package]] name = "object" version = "0.32.2" @@ -5837,9 +6201,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.12.4" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c1be0c6c22ec0817cdc77d3842f721a17fd30ab6965001415b5402a74e6b740" +checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" dependencies = [ "async-trait", "base64 0.22.1", @@ -5864,7 +6228,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tracing", "url", @@ -6294,7 +6658,7 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" dependencies = [ - "fastrand", + "fastrand 2.3.0", "phf_shared 0.13.1", ] @@ -6348,6 +6712,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "piper" +version = "0.2.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066" +dependencies = [ + "atomic-waker", + "fastrand 2.3.0", + "futures-io", +] + [[package]] name = "pkcs1" version = "0.7.5" @@ -6430,6 +6805,20 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "polling" +version = "3.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218" +dependencies = [ + "cfg-if", + "concurrent-queue", + "hermit-abi", + "pin-project-lite", + "rustix 1.1.3", + "windows-sys 0.61.2", +] + [[package]] name = "portable-atomic" version = "1.13.0" @@ -6685,6 +7074,16 @@ dependencies = [ "memchr", ] +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quick-xml" version = "0.37.5" @@ -6719,7 +7118,7 @@ dependencies = [ "rustc-hash", "rustls 0.23.36", "socket2 0.6.1", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tracing", "web-time", @@ -6740,7 +7139,7 @@ dependencies = [ "rustls 0.23.36", "rustls-pki-types", "slab", - "thiserror 2.0.17", + "thiserror 2.0.18", "tinyvec", "tracing", "web-time", @@ -6781,6 +7180,19 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", +] + [[package]] name = "rand" version = "0.8.5" @@ -6802,6 +7214,16 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -6822,6 +7244,15 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + [[package]] name = "rand_core" version = "0.6.4" @@ -6860,6 +7291,15 @@ dependencies = [ "rand 0.9.2", ] +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + [[package]] name = "rand_xorshift" version = "0.4.0" @@ -6991,7 +7431,7 @@ checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ "getrandom 0.2.17", "libredox", - "thiserror 2.0.17", + "thiserror 2.0.18", ] [[package]] @@ -7266,9 +7706,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" +checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" [[package]] name = 
"rustc-hash" @@ -7334,7 +7774,7 @@ dependencies = [ "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.103.8", + "rustls-webpki 0.103.9", "subtle", "zeroize", ] @@ -7362,9 +7802,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.13.2" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21e6f2ab2928ca4291b86736a8bd920a277a399bba1589409d72154ff87c1282" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" dependencies = [ "web-time", "zeroize", @@ -7382,9 +7822,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.8" +version = "0.103.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" +checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" dependencies = [ "aws-lc-rs", "ring", @@ -7631,6 +8071,17 @@ dependencies = [ "serde_core", ] +[[package]] +name = "serde_qs" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" +dependencies = [ + "percent-encoding", + "serde", + "thiserror 1.0.69", +] + [[package]] name = "serde_repr" version = "0.1.20" @@ -7775,7 +8226,7 @@ checksum = "297f631f50729c8c99b84667867963997ec0b50f32b2a7dbcab828ef0541e8bb" dependencies = [ "num-bigint", "num-traits", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", ] @@ -8184,7 +8635,7 @@ dependencies = [ "itertools 0.14.0", "levenshtein_automata", "log", - "lru", + "lru 0.12.5", "lz4_flex", "measure_time", "memmap2", @@ -8206,7 +8657,7 @@ dependencies = [ "tantivy-stacker", "tantivy-tokenizer-api", "tempfile", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", "uuid", "winapi", @@ -8329,7 +8780,7 @@ version = "3.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" dependencies = [ - "fastrand", + "fastrand 2.3.0", "getrandom 0.3.4", "once_cell", "rustix 1.1.3", @@ -8375,11 +8826,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl 2.0.17", + "thiserror-impl 2.0.18", ] [[package]] @@ -8395,9 +8846,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", @@ -8441,7 +8892,10 @@ checksum = "f9e442fc33d7fdb45aa9bfeb312c095964abdf596f7567261062b2a7107aaabd" dependencies = [ "deranged", "itoa", + "js-sys", + "libc", "num-conv", + "num_threads", "powerfmt", "serde_core", "time-core", @@ -8851,7 +9305,7 @@ dependencies = [ "serde", "serde_json", "syn 2.0.114", - "thiserror 2.0.17", + "thiserror 2.0.18", "unicode-ident", ] @@ -8872,6 +9326,15 @@ dependencies = [ "typify-impl", ] +[[package]] +name = "tz-rs" +version = "0.6.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4" +dependencies = [ + 
"const_fn", +] + [[package]] name = "unarray" version = "0.1.4" @@ -8980,6 +9443,7 @@ dependencies = [ "idna", "percent-encoding", "serde", + "serde_derive", ] [[package]] @@ -9057,6 +9521,12 @@ dependencies = [ "libc", ] +[[package]] +name = "waker-fn" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "317211a0dc0ceedd78fb2ca9a44aed3d7b9b26f81870d485c07122b4350673b7" + [[package]] name = "walkdir" version = "2.5.0" @@ -9076,6 +9546,12 @@ dependencies = [ "try-lock", ] +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -9084,18 +9560,18 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasip2" -version = "1.0.1+wasi-0.2.4" +version = "1.0.2+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" dependencies = [ "wit-bindgen", ] [[package]] name = "wasm-bindgen" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d759f433fa64a2d763d1340820e46e111a7a5ab75f993d1852d70b03dbb80fd" +checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" dependencies = [ "cfg-if", "once_cell", @@ -9106,11 +9582,12 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.56" +version = "0.4.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c" +checksum = "70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" dependencies = [ "cfg-if", + "futures-util", "js-sys", "once_cell", "wasm-bindgen", @@ -9119,9 +9596,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48cb0d2638f8baedbc542ed444afc0644a29166f1595371af4fecf8ce1e7eeb3" +checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -9129,9 +9606,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cefb59d5cd5f92d9dcf80e4683949f15ca4b511f4ac0a6e14d4e1ac60c6ecd40" +checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" dependencies = [ "bumpalo", "proc-macro2", @@ -9142,9 +9619,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbc538057e648b67f72a982e708d485b2efa771e1ac05fec311f9f63e5800db4" +checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" dependencies = [ "unicode-ident", ] @@ -9164,9 +9641,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.83" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b32828d774c412041098d182a8b38b16ea816958e07cf40eec2bc080ae137ac" +checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" dependencies = [ 
"js-sys", "wasm-bindgen", @@ -9566,9 +10043,9 @@ dependencies = [ [[package]] name = "wit-bindgen" -version = "0.46.0" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" [[package]] name = "wkb" @@ -9764,9 +10241,9 @@ checksum = "40990edd51aae2c2b6907af74ffb635029d5788228222c4bb811e9351c0caad3" [[package]] name = "zmij" -version = "1.0.14" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd8f3f50b848df28f887acb68e41201b5aea6bc8a8dacc00fb40635ff9a72fea" +checksum = "dfcd145825aace48cff44a8844de64bf75feec3080e0aa5cdbde72961ae51a65" [[package]] name = "zstd" diff --git a/Cargo.toml b/Cargo.toml index 1b8d05eff6f..4cf3883ed23 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -63,7 +63,7 @@ lance-io = { version = "=1.0.3-beta.0", path = "./rust/lance-io", default-featur lance-linalg = { version = "=1.0.3-beta.0", path = "./rust/lance-linalg" } lance-namespace = { version = "=1.0.3-beta.0", path = "./rust/lance-namespace" } lance-namespace-impls = { version = "=1.0.3-beta.0", path = "./rust/lance-namespace-impls" } -lance-namespace-reqwest-client = "0.0.18" +lance-namespace-reqwest-client = { version = "=0.4.5" } lance-table = { version = "=1.0.3-beta.0", path = "./rust/lance-table" } lance-test-macros = { version = "=1.0.3-beta.0", path = "./rust/lance-test-macros" } lance-testing = { version = "=1.0.3-beta.0", path = "./rust/lance-testing" } diff --git a/deny.toml b/deny.toml index ba5eed05786..cc7cd6d6023 100644 --- a/deny.toml +++ b/deny.toml @@ -85,7 +85,9 @@ ignore = [ { id = "RUSTSEC-2024-0436", reason = "`paste` is used by datafusion" }, { id = "RUSTSEC-2023-0071", reason = "`rsa` is used by opendal via reqsign" }, { id = "RUSTSEC-2025-0119", reason = "`number_prefix` used by hf-hub in examples" }, - { id = "RUSTSEC-2025-0134", reason = "`rustls-pemfile` unmaintained; awaiting upstream object_store/hyper-rustls migration to rustls-pki-types" } + { id = "RUSTSEC-2025-0134", reason = "`rustls-pemfile` unmaintained; awaiting upstream object_store/hyper-rustls migration to rustls-pki-types" }, + { id = "RUSTSEC-2025-0141", reason = "`bincode` is unmaintained and used by tantivy"}, + { id = "RUSTSEC-2026-0002", reason = "`lru` is used by tantivy and aws-sdk-s3"}, ] # If this is true, then cargo deny will use the git executable to fetch advisory database. # If this is false, then it uses a built-in git library. diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index e6406cec933..8a85397603e 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. 
 version = 3
 
+[[package]]
+name = "RustyXML"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5"
+
 [[package]]
 name = "adler2"
 version = "2.0.1"
@@ -384,6 +390,17 @@ dependencies = [
  "regex-syntax",
 ]
 
+[[package]]
+name = "async-channel"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
+dependencies = [
+ "concurrent-queue",
+ "event-listener 2.5.3",
+ "futures-core",
+]
+
 [[package]]
 name = "async-channel"
 version = "2.5.0"
@@ -413,17 +430,53 @@ dependencies = [
  "zstd-safe",
 ]
 
+[[package]]
+name = "async-io"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "456b8a8feb6f42d237746d4b3e9a178494627745c3c56c6ea55d92ba50d026fc"
+dependencies = [
+ "autocfg",
+ "cfg-if",
+ "concurrent-queue",
+ "futures-io",
+ "futures-lite 2.6.1",
+ "parking",
+ "polling",
+ "rustix 1.1.3",
+ "slab",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "async-lock"
 version = "3.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "290f7f2596bd5b78a9fec8088ccd89180d7f9f55b94b0576823bbbdc72ee8311"
 dependencies = [
- "event-listener",
+ "event-listener 5.4.1",
  "event-listener-strategy",
  "pin-project-lite",
 ]
 
+[[package]]
+name = "async-process"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc50921ec0055cdd8a16de48773bfeec5c972598674347252c0399676be7da75"
+dependencies = [
+ "async-channel 2.5.0",
+ "async-io",
+ "async-lock",
+ "async-signal",
+ "async-task",
+ "blocking",
+ "cfg-if",
+ "event-listener 5.4.1",
+ "futures-lite 2.6.1",
+ "rustix 1.1.3",
+]
+
 [[package]]
 name = "async-recursion"
 version = "1.1.1"
@@ -435,6 +488,30 @@ dependencies = [
  "syn 2.0.114",
 ]
 
+[[package]]
+name = "async-signal"
+version = "0.2.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43c070bbf59cd3570b6b2dd54cd772527c7c3620fce8be898406dd3ed6adc64c"
+dependencies = [
+ "async-io",
+ "async-lock",
+ "atomic-waker",
+ "cfg-if",
+ "futures-core",
+ "futures-io",
+ "rustix 1.1.3",
+ "signal-hook-registry",
+ "slab",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "async-task"
+version = "4.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de"
+
 [[package]]
 name = "async-trait"
 version = "0.1.89"
@@ -495,7 +572,7 @@ dependencies = [
  "aws-smithy-types",
  "aws-types",
  "bytes",
- "fastrand",
+ "fastrand 2.3.0",
  "hex",
  "http 1.4.0",
  "ring",
@@ -520,9 +597,9 @@ dependencies = [
 
 [[package]]
 name = "aws-lc-rs"
-version = "1.15.2"
+version = "1.15.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a88aab2464f1f25453baa7a07c84c5b7684e274054ba06817f382357f77a288"
+checksum = "e84ce723ab67259cfeb9877c6a639ee9eb7a27b28123abd71db7f0d5d0cc9d86"
 dependencies = [
  "aws-lc-sys",
  "zeroize",
@@ -530,9 +607,9 @@ dependencies = [
 
 [[package]]
 name = "aws-lc-sys"
-version = "0.35.0"
+version = "0.36.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b45afffdee1e7c9126814751f88dddc747f41d91da16c9551a0f1e8a11e788a1"
+checksum = "43a442ece363113bd4bd4c8b18977a7798dd4d3c3383f34fb61936960e8f4ad8"
 dependencies = [
  "cc",
  "cmake",
@@ -542,9 +619,9 @@ dependencies = [
 
 [[package]]
 name = "aws-runtime"
-version = "1.5.17"
+version = "1.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d81b5b2898f6798ad58f484856768bca817e3cd9de0974c24ae0f1113fe88f1b" +checksum = "959dab27ce613e6c9658eb3621064d0e2027e5f2acb65bc526a43577facea557" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -555,7 +632,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "http-body 0.4.6", "percent-encoding", @@ -566,21 +643,22 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.91.0" +version = "1.92.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ee6402a36f27b52fe67661c6732d684b2635152b676aa2babbfb5204f99115d" +checksum = "b7d63bd2bdeeb49aa3f9b00c15e18583503b778b2e792fc06284d54e7d5b6566" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -588,21 +666,22 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.93.0" +version = "1.94.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a45a7f750bbd170ee3677671ad782d90b894548f4e4ae168302c57ec9de5cb3e" +checksum = "532d93574bf731f311bafb761366f9ece345a0416dbcc273d81d6d1a1205239b" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -610,22 +689,23 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.95.0" +version = "1.96.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55542378e419558e6b1f398ca70adb0b2088077e79ad9f14eb09441f2f7b2164" +checksum = "357e9a029c7524db6a0099cd77fbd5da165540339e7296cca603531bc783b56c" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-smithy-xml", "aws-types", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -694,17 +774,23 @@ dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", "aws-smithy-types", - "h2", + "h2 0.3.27", + "h2 0.4.13", + "http 0.2.12", "http 1.4.0", - "hyper", - "hyper-rustls", + "http-body 0.4.6", + "hyper 0.14.32", + "hyper 1.8.1", + "hyper-rustls 0.24.2", + "hyper-rustls 0.27.7", "hyper-util", "pin-project-lite", - "rustls", + "rustls 0.21.12", + "rustls 0.23.36", "rustls-native-certs", "rustls-pki-types", "tokio", - "tokio-rustls", + "tokio-rustls 0.26.4", "tower", "tracing", ] @@ -720,9 +806,9 @@ dependencies = [ [[package]] name = "aws-smithy-observability" -version = "0.1.5" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f616c3f2260612fe44cede278bafa18e73e6479c4e393e2c4518cf2a9a228a" +checksum = "ef1fcbefc7ece1d70dcce29e490f269695dfca2d2bacdeaf9e5c3f799e4e6a42" dependencies = [ "aws-smithy-runtime-api", ] @@ -739,9 +825,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.9.5" +version = "1.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a392db6c583ea4a912538afb86b7be7c5d8887d91604f50eb55c262ee1b4a5f5" +checksum = "bb5b6167fcdf47399024e81ac08e795180c576a20e4d4ce67949f9a88ae37dc1" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -750,7 +836,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "http 1.4.0", "http-body 0.4.6", @@ -763,9 +849,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.9.3" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab0d43d899f9e508300e587bf582ba54c27a452dd0a9ea294690669138ae14a2" +checksum = "efce7aaaf59ad53c5412f14fc19b2d5c6ab2c3ec688d272fd31f76ec12f44fb0" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -780,13 +866,14 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.3.5" +version = "1.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "905cb13a9895626d49cf2ced759b062d913834c7482c38e49557eac4e6193f01" +checksum = "65f172bcb02424eb94425db8aed1b6d583b5104d4d5ddddf22402c661a320048" dependencies = [ "base64-simd", "bytes", "bytes-utils", + "futures-core", "http 0.2.12", "http 1.4.0", "http-body 0.4.6", @@ -799,6 +886,8 @@ dependencies = [ "ryu", "serde", "time", + "tokio", + "tokio-util", ] [[package]] @@ -837,7 +926,7 @@ dependencies = [ "http 1.4.0", "http-body 1.0.1", "http-body-util", - "hyper", + "hyper 1.8.1", "hyper-util", "itoa", "matchit", @@ -879,17 +968,130 @@ dependencies = [ "tracing", ] +[[package]] +name = "azure_core" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b552ad43a45a746461ec3d3a51dfb6466b4759209414b439c165eb6a6b7729e" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes", + "dyn-clone", + "futures", + "getrandom 0.2.17", + "hmac", + "http-types", + "once_cell", + "paste", + "pin-project", + "quick-xml 0.31.0", + "rand 0.8.5", + "reqwest", + "rustc_version", + "serde", + "serde_json", + "sha2", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_identity" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ddd80344317c40c04b603807b63a5cefa532f1b43522e72f480a988141f744" +dependencies = [ + "async-lock", + "async-process", + "async-trait", + "azure_core", + "futures", + "oauth2", + "pin-project", + "serde", + "time", + "tracing", + "tz-rs", + "url", + "uuid", +] + +[[package]] +name = "azure_storage" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59f838159f4d29cb400a14d9d757578ba495ae64feb07a7516bf9e4415127126" +dependencies = [ + "RustyXML", + "async-lock", + "async-trait", + "azure_core", + "bytes", + "serde", + "serde_derive", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_storage_blobs" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97e83c3636ae86d9a6a7962b2112e3b19eb3903915c50ce06ff54ff0a2e6a7e4" +dependencies = [ + "RustyXML", + "azure_core", + "azure_storage", + "azure_svc_blobstorage", + "bytes", + "futures", + "serde", + "serde_derive", + "serde_json", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_svc_blobstorage" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e6c6f20c5611b885ba94c7bae5e02849a267381aecb8aee577e8c35ff4064c6" +dependencies = [ + "azure_core", + "bytes", + "futures", + "log", + 
"once_cell", + "serde", + "serde_json", + "time", +] + [[package]] name = "backon" version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cffb0e931875b666fc4fcb20fee52e9bbd1ef836fd9e9e04ec21555f9f85f7ef" dependencies = [ - "fastrand", + "fastrand 2.3.0", "gloo-timers", "tokio", ] +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.21.7" @@ -999,6 +1201,19 @@ dependencies = [ "generic-array", ] +[[package]] +name = "blocking" +version = "1.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e83f8d02be6967315521be875afa792a316e28d57b5a2d401897e2a7921b7f21" +dependencies = [ + "async-channel 2.5.0", + "async-task", + "futures-io", + "futures-lite 2.6.1", + "piper", +] + [[package]] name = "bon" version = "3.8.2" @@ -1015,7 +1230,7 @@ version = "3.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89ec27229c38ed0eb3c0feee3d2c1d6a4379ae44f418a29a658890e062d8f365" dependencies = [ - "darling", + "darling 0.23.0", "ident_case", "prettyplease", "proc-macro2", @@ -1118,9 +1333,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.52" +version = "1.2.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd4932aefd12402b36c60956a4fe0035421f544799057659ff86f923657aada3" +checksum = "755d2fce177175ffca841e9a06afdb2c4ab0f593d53b4dee48147dfaade85932" dependencies = [ "find-msvc-tools", "jobserver", @@ -1154,9 +1369,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.42" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" +checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" dependencies = [ "iana-time-zone", "js-sys", @@ -1257,12 +1472,28 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "const_fn" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f8a2ca5ac02d09563609681103aada9e1777d54fc57a5acd7a41404f9c93b6e" + [[package]] name = "constant_time_eq" version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation" version = "0.10.1" @@ -1386,14 +1617,38 @@ dependencies = [ "memchr", ] +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core 0.20.11", + "darling_macro 0.20.11", +] + [[package]] name = "darling" version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" dependencies = [ - "darling_core", - "darling_macro", + "darling_core 0.23.0", + "darling_macro 0.23.0", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.114", ] [[package]] @@ -1409,13 +1664,24 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core 0.20.11", + "quote", + "syn 2.0.114", +] + [[package]] name = "darling_macro" version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ - "darling_core", + "darling_core 0.23.0", "quote", "syn 2.0.114", ] @@ -2138,6 +2404,37 @@ dependencies = [ "serde_core", ] +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling 0.20.11", + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn 2.0.114", +] + [[package]] name = "digest" version = "0.10.7" @@ -2279,6 +2576,12 @@ version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca81e6b4777c89fd810c25a4be2b1bd93ea034fbe58e6a75216a34c6b82c539b" +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + [[package]] name = "event-listener" version = "5.4.1" @@ -2296,7 +2599,7 @@ version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" dependencies = [ - "event-listener", + "event-listener 5.4.1", "pin-project-lite", ] @@ -2312,6 +2615,15 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9afc2bd4d5a73106dd53d10d73d3401c2f32730ba2c0b93ddb888a8983680471" +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -2320,9 +2632,9 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "find-msvc-tools" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f449e6c6c08c865631d4890cfacf252b3d396c9bcc83adb6623cdb02a8336c41" +checksum = "8591b0bcc8a98a64310a2fae1bb3e9b8564dd10e381e6e28010fde8e8e8568db" [[package]] name = "fixedbitset" @@ -2375,6 +2687,21 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -2471,6 +2798,34 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +[[package]] +name = "futures-lite" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand 1.9.0", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + +[[package]] +name = "futures-lite" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f78e10609fe0e0b3f4157ffab1876319b5b0db102a2c60dc4626306dc46b44ad" +dependencies = [ + "fastrand 2.3.0", + "futures-core", + "futures-io", + "parking", + "pin-project-lite", +] + [[package]] name = "futures-macro" version = "0.3.31" @@ -2659,6 +3014,17 @@ dependencies = [ "libm", ] +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -2668,7 +3034,7 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -2704,6 +3070,45 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "google-cloud-auth" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5572275b7f06b6fde8eec61a23d87c83aae362bee586bbeb8773b3f98658ae81" +dependencies = [ + "async-trait", + "base64 0.22.1", + "derive_builder", + "http 1.4.0", + "reqwest", + "rustls 0.23.36", + "rustls-pemfile", + "serde", + "serde_json", + "thiserror 2.0.18", + "time", + "tokio", +] + +[[package]] +name = "h2" +version = "0.3.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http 0.2.12", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "h2" version = "0.4.13" @@ -2883,6 +3288,26 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-types" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e9b187a72d63adbfba487f48095306ac823049cb504ee195541e91c7775f5ad" +dependencies = [ + "anyhow", + "async-channel 1.9.0", + "base64 0.13.1", + "futures-lite 1.13.0", + "infer", + "pin-project-lite", + "rand 0.7.3", + "serde", + "serde_json", + "serde_qs", + "serde_urlencoded", + "url", +] + [[package]] name = "httparse" version = "1.10.1" @@ -2901,6 +3326,30 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" +[[package]] +name = "hyper" +version = "0.14.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2 0.3.27", + "http 0.2.12", + "http-body 0.4.6", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2 0.5.10", + "tokio", + "tower-service", + "tracing", + "want", +] + [[package]] name = "hyper" version = "1.8.1" @@ -2911,7 +3360,7 @@ dependencies = [ "bytes", "futures-channel", "futures-core", - "h2", + "h2 0.4.13", "http 1.4.0", "http-body 1.0.1", "httparse", @@ -2924,6 +3373,21 @@ dependencies = [ "want", ] +[[package]] +name = "hyper-rustls" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +dependencies = [ + "futures-util", + "http 0.2.12", + "hyper 0.14.32", + "log", + "rustls 0.21.12", + "tokio", + "tokio-rustls 0.24.1", +] + [[package]] name = "hyper-rustls" version = "0.27.7" @@ -2931,15 +3395,31 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" dependencies = [ "http 1.4.0", - "hyper", + "hyper 1.8.1", + "hyper-util", + "rustls 0.23.36", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.26.4", + "tower-service", + "webpki-roots", +] + +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.8.1", "hyper-util", - "rustls", - "rustls-native-certs", - "rustls-pki-types", + "native-tls", "tokio", - "tokio-rustls", + "tokio-native-tls", "tower-service", - "webpki-roots", ] [[package]] @@ -2955,15 +3435,17 @@ dependencies = [ "futures-util", "http 1.4.0", "http-body 1.0.1", - "hyper", + "hyper 1.8.1", "ipnet", "libc", "percent-encoding", "pin-project-lite", - "socket2", + "socket2 0.6.1", + "system-configuration", "tokio", "tower-service", "tracing", + "windows-registry", ] [[package]] @@ -3160,6 +3642,12 @@ dependencies = [ "hashbrown 0.16.1", ] +[[package]] +name = "infer" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac" + [[package]] name = "inout" version = "0.1.4" @@ -3170,6 +3658,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", +] + [[package]] name = "integer-encoding" version = "3.0.4" @@ -3306,9 +3803,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.83" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "464a3709c7f55f1f721e5389aa6ea4e3bc6aba669353300af094b29ffbdde1d8" +checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" dependencies = [ "once_cell", "wasm-bindgen", @@ -3614,7 +4111,7 @@ dependencies = [ "arrow-ord", "arrow-schema", "arrow-select", - "async-channel", + "async-channel 2.5.0", "async-recursion", "async-trait", "bitpacking", @@ -3774,9 +4271,19 @@ 
dependencies = [ "arrow-ipc", "arrow-schema", "async-trait", + "aws-config", + "aws-credential-types", + "aws-sdk-sts", "axum", + "azure_core", + "azure_identity", + "azure_storage", + "azure_storage_blobs", + "base64 0.22.1", "bytes", + "chrono", "futures", + "google-cloud-auth", "lance", "lance-core", "lance-index", @@ -3788,7 +4295,9 @@ dependencies = [ "reqwest", "serde", "serde_json", + "sha2", "snafu", + "time", "tokio", "tower", "tower-http 0.5.2", @@ -3797,9 +4306,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.0.18" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ea349999bcda4eea53fc05d334b3775ec314761e6a706555c777d7a29b18d19" +checksum = "a2acdba67f84190067532fce07b51a435dd390d7cdc1129a05003e5cb3274cf0" dependencies = [ "reqwest", "serde", @@ -4146,7 +4655,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" dependencies = [ "libc", - "wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "windows-sys 0.61.2", ] @@ -4167,7 +4676,7 @@ dependencies = [ "crossbeam-epoch", "crossbeam-utils", "equivalent", - "event-listener", + "event-listener 5.4.1", "futures-util", "parking_lot", "portable-atomic", @@ -4188,6 +4697,23 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" +[[package]] +name = "native-tls" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe 0.1.6", + "openssl-sys", + "schannel", + "security-framework 2.11.1", + "security-framework-sys", + "tempfile", +] + [[package]] name = "ndarray" version = "0.16.1" @@ -4359,6 +4885,34 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "num_threads" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" +dependencies = [ + "libc", +] + +[[package]] +name = "oauth2" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c38841cdd844847e3e7c8d29cef9dcfed8877f8f56f9071f77843ecf3baf937f" +dependencies = [ + "base64 0.13.1", + "chrono", + "getrandom 0.2.17", + "http 0.2.12", + "rand 0.8.5", + "serde", + "serde_json", + "serde_path_to_error", + "sha2", + "thiserror 1.0.69", + "url", +] + [[package]] name = "object" version = "0.32.2" @@ -4370,9 +4924,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.12.4" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c1be0c6c22ec0817cdc77d3842f721a17fd30ab6965001415b5402a74e6b740" +checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" dependencies = [ "async-trait", "base64 0.22.1", @@ -4384,7 +4938,7 @@ dependencies = [ "http-body-util", "httparse", "humantime", - "hyper", + "hyper 1.8.1", "itertools 0.14.0", "md-5", "parking_lot", @@ -4397,7 +4951,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tracing", "url", @@ -4470,12 +5024,56 @@ dependencies = [ "uuid", ] +[[package]] +name = "openssl" +version = "0.10.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "openssl-probe" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" + [[package]] name = "openssl-probe" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f50d9b3dabb09ecd771ad0aa242ca6894994c130308ca3d7684634df8037391" +[[package]] +name = "openssl-sys" +version = "0.9.111" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -4759,6 +5357,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "piper" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066" +dependencies = [ + "atomic-waker", + "fastrand 2.3.0", + "futures-io", +] + [[package]] name = "pkcs1" version = "0.7.5" @@ -4803,6 +5412,20 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "polling" +version = "3.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218" +dependencies = [ + "cfg-if", + "concurrent-queue", + "hermit-abi", + "pin-project-lite", + "rustix 1.1.3", + "windows-sys 0.61.2", +] + [[package]] name = "portable-atomic" version = "1.13.0" @@ -4932,6 +5555,16 @@ dependencies = [ "cc", ] +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quick-xml" version = "0.37.5" @@ -4964,9 +5597,9 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash", - "rustls", - "socket2", - "thiserror 2.0.17", + "rustls 0.23.36", + "socket2 0.6.1", + "thiserror 2.0.18", "tokio", "tracing", "web-time", @@ -4984,10 +5617,10 @@ dependencies = [ "rand 0.9.2", "ring", "rustc-hash", - "rustls", + "rustls 0.23.36", "rustls-pki-types", "slab", - "thiserror 2.0.17", + "thiserror 2.0.18", "tinyvec", "tracing", "web-time", @@ -5002,7 +5635,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2", + "socket2 0.6.1", "tracing", "windows-sys 0.60.2", ] @@ -5028,6 +5661,19 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" 
+dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", +] + [[package]] name = "rand" version = "0.8.5" @@ -5049,6 +5695,16 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -5069,6 +5725,15 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + [[package]] name = "rand_core" version = "0.6.4" @@ -5107,6 +5772,15 @@ dependencies = [ "rand 0.9.2", ] +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + [[package]] name = "rand_xoshiro" version = "0.7.0" @@ -5198,7 +5872,7 @@ checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ "getrandom 0.2.17", "libredox", - "thiserror 2.0.17", + "thiserror 2.0.18", ] [[package]] @@ -5289,21 +5963,23 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", - "h2", + "h2 0.4.13", "http 1.4.0", "http-body 1.0.1", "http-body-util", - "hyper", - "hyper-rustls", + "hyper 1.8.1", + "hyper-rustls 0.27.7", + "hyper-tls", "hyper-util", "js-sys", "log", "mime", "mime_guess", + "native-tls", "percent-encoding", "pin-project-lite", "quinn", - "rustls", + "rustls 0.23.36", "rustls-native-certs", "rustls-pki-types", "serde", @@ -5311,7 +5987,8 @@ dependencies = [ "serde_urlencoded", "sync_wrapper", "tokio", - "tokio-rustls", + "tokio-native-tls", + "tokio-rustls 0.26.4", "tokio-util", "tower", "tower-http 0.6.8", @@ -5447,6 +6124,18 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "rustls" +version = "0.21.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" +dependencies = [ + "log", + "ring", + "rustls-webpki 0.101.7", + "sct", +] + [[package]] name = "rustls" version = "0.23.36" @@ -5454,10 +6143,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" dependencies = [ "aws-lc-rs", + "log", "once_cell", "ring", "rustls-pki-types", - "rustls-webpki", + "rustls-webpki 0.103.9", "subtle", "zeroize", ] @@ -5468,10 +6158,10 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" dependencies = [ - "openssl-probe", + "openssl-probe 0.2.0", "rustls-pki-types", "schannel", - "security-framework", + "security-framework 3.5.1", ] [[package]] @@ -5485,9 +6175,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.13.2" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21e6f2ab2928ca4291b86736a8bd920a277a399bba1589409d72154ff87c1282" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" dependencies = [ "web-time", "zeroize", @@ -5495,9 +6185,19 @@ dependencies = [ [[package]] 
name = "rustls-webpki" -version = "0.103.8" +version = "0.101.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" +checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" dependencies = [ "aws-lc-rs", "ring", @@ -5591,6 +6291,29 @@ dependencies = [ "sha2", ] +[[package]] +name = "sct" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags", + "core-foundation 0.9.4", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + [[package]] name = "security-framework" version = "3.5.1" @@ -5598,7 +6321,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" dependencies = [ "bitflags", - "core-foundation", + "core-foundation 0.10.1", "core-foundation-sys", "libc", "security-framework-sys", @@ -5695,6 +6418,17 @@ dependencies = [ "serde_core", ] +[[package]] +name = "serde_qs" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" +dependencies = [ + "percent-encoding", + "serde", + "thiserror 1.0.69", +] + [[package]] name = "serde_repr" version = "0.1.20" @@ -5829,7 +6563,7 @@ checksum = "297f631f50729c8c99b84667867963997ec0b50f32b2a7dbcab828ef0541e8bb" dependencies = [ "num-bigint", "num-traits", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", ] @@ -5887,6 +6621,16 @@ version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" +[[package]] +name = "socket2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "socket2" version = "0.6.1" @@ -6079,6 +6823,27 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -6131,7 +6896,7 @@ dependencies = [ "tantivy-stacker", "tantivy-tokenizer-api", "tempfile", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", "uuid", "winapi", @@ -6243,7 +7008,7 @@ version = "3.24.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" dependencies = [ - "fastrand", + "fastrand 2.3.0", "getrandom 0.3.4", "once_cell", "rustix 1.1.3", @@ -6261,11 +7026,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl 2.0.17", + "thiserror-impl 2.0.18", ] [[package]] @@ -6281,9 +7046,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", @@ -6327,7 +7092,10 @@ checksum = "f9e442fc33d7fdb45aa9bfeb312c095964abdf596f7567261062b2a7107aaabd" dependencies = [ "deranged", "itoa", + "js-sys", + "libc", "num-conv", + "num_threads", "powerfmt", "serde_core", "time-core", @@ -6396,7 +7164,7 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2", + "socket2 0.6.1", "tokio-macros", "windows-sys 0.61.2", ] @@ -6412,13 +7180,33 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +dependencies = [ + "rustls 0.21.12", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" dependencies = [ - "rustls", + "rustls 0.23.36", "tokio", ] @@ -6653,7 +7441,7 @@ dependencies = [ "serde", "serde_json", "syn 2.0.114", - "thiserror 2.0.17", + "thiserror 2.0.18", "unicode-ident", ] @@ -6674,6 +7462,15 @@ dependencies = [ "typify-impl", ] +[[package]] +name = "tz-rs" +version = "0.6.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4" +dependencies = [ + "const_fn", +] + [[package]] name = "unicase" version = "2.9.0" @@ -6720,6 +7517,7 @@ dependencies = [ "idna", "percent-encoding", "serde", + "serde_derive", ] [[package]] @@ -6764,6 +7562,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -6776,6 +7580,12 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" +[[package]] +name = "waker-fn" +version = "1.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "317211a0dc0ceedd78fb2ca9a44aed3d7b9b26f81870d485c07122b4350673b7" + [[package]] name = "walkdir" version = "2.5.0" @@ -6795,6 +7605,12 @@ dependencies = [ "try-lock", ] +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -6803,18 +7619,18 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasip2" -version = "1.0.1+wasi-0.2.4" +version = "1.0.2+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" dependencies = [ "wit-bindgen", ] [[package]] name = "wasm-bindgen" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d759f433fa64a2d763d1340820e46e111a7a5ab75f993d1852d70b03dbb80fd" +checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" dependencies = [ "cfg-if", "once_cell", @@ -6825,11 +7641,12 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.56" +version = "0.4.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c" +checksum = "70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" dependencies = [ "cfg-if", + "futures-util", "js-sys", "once_cell", "wasm-bindgen", @@ -6838,9 +7655,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48cb0d2638f8baedbc542ed444afc0644a29166f1595371af4fecf8ce1e7eeb3" +checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6848,9 +7665,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cefb59d5cd5f92d9dcf80e4683949f15ca4b511f4ac0a6e14d4e1ac60c6ecd40" +checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" dependencies = [ "bumpalo", "proc-macro2", @@ -6861,9 +7678,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbc538057e648b67f72a982e708d485b2efa771e1ac05fec311f9f63e5800db4" +checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" dependencies = [ "unicode-ident", ] @@ -6883,9 +7700,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.83" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b32828d774c412041098d182a8b38b16ea816958e07cf40eec2bc080ae137ac" +checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" dependencies = [ "js-sys", "wasm-bindgen", @@ -6982,6 +7799,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-registry" +version = "0.6.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link", + "windows-result", + "windows-strings", +] + [[package]] name = "windows-result" version = "0.4.1" @@ -7242,9 +8070,9 @@ dependencies = [ [[package]] name = "wit-bindgen" -version = "0.46.0" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" [[package]] name = "wkb" @@ -7418,9 +8246,9 @@ checksum = "40990edd51aae2c2b6907af74ffb635029d5788228222c4bb811e9351c0caad3" [[package]] name = "zmij" -version = "1.0.14" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd8f3f50b848df28f887acb68e41201b5aea6bc8a8dacc00fb40635ff9a72fea" +checksum = "dfcd145825aace48cff44a8844de64bf75feec3080e0aa5cdbde72961ae51a65" [[package]] name = "zstd" diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index 132333a1ec1..8d426cb3e84 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -12,6 +12,13 @@ description = "JNI bindings for Lance Columnar format" [lib] crate-type = ["cdylib"] +[features] +default = [] +# Credential vending features for DirectoryNamespace +credential-vendor-aws = ["lance-namespace-impls/credential-vendor-aws"] +credential-vendor-gcp = ["lance-namespace-impls/credential-vendor-gcp"] +credential-vendor-azure = ["lance-namespace-impls/credential-vendor-azure"] + [dependencies] lance = { path = "../../rust/lance", features = ["substrait"] } lance-datafusion = { path = "../../rust/lance-datafusion" } @@ -20,7 +27,7 @@ lance-linalg = { path = "../../rust/lance-linalg" } lance-index = { path = "../../rust/lance-index" } lance-io = { path = "../../rust/lance-io" } lance-namespace = { path = "../../rust/lance-namespace" } -lance-namespace-impls = { path = "../../rust/lance-namespace-impls", features = ["rest", "rest-adapter"] } +lance-namespace-impls = { path = "../../rust/lance-namespace-impls", features = ["rest", "rest-adapter", "credential-vendor-aws", "credential-vendor-gcp", "credential-vendor-azure"] } lance-core = { path = "../../rust/lance-core" } lance-file = { path = "../../rust/lance-file" } arrow = { version = "56.1", features = ["ffi"] } diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index b15132ad00b..ad14d02d6da 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -59,16 +59,32 @@ pub struct BlockingDataset { } impl BlockingDataset { - /// Get the storage options provider that was used when opening this dataset - pub fn get_storage_options_provider(&self) -> Option> { - self.inner.storage_options_provider() + /// Get the initial storage options used to open this dataset. + /// + /// Returns the options that were provided when the dataset was opened, + /// without any refresh from the provider. Returns None if no storage options + /// were provided. + pub fn initial_storage_options(&self) -> Option> { + self.inner.initial_storage_options().cloned() + } + + /// Get the latest storage options, potentially refreshed from the provider. + /// + /// If a storage options provider was configured and credentials are expiring, + /// this will refresh them. 
+ pub fn latest_storage_options(&self) -> Result<Option<HashMap<String, String>>> { + RT.block_on(async { self.inner.latest_storage_options().await }) + .map(|opt| opt.map(|opts| opts.0)) + .map_err(|e| Error::io_error(e.to_string())) } pub fn drop(uri: &str, storage_options: HashMap<String, String>) -> Result<()> { RT.block_on(async move { let registry = Arc::new(ObjectStoreRegistry::default()); let object_store_params = ObjectStoreParams { - storage_options: Some(storage_options.clone()), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), ..Default::default() }; let (object_store, path) = @@ -100,20 +116,29 @@ impl BlockingDataset { storage_options: HashMap<String, String>, serialized_manifest: Option<&[u8]>, storage_options_provider: Option<Arc<dyn StorageOptionsProvider>>, - s3_credentials_refresh_offset_seconds: Option<u64>, ) -> Result<Self> { - let mut store_params = ObjectStoreParams { + // Create storage options accessor from storage_options and provider + let accessor = match (storage_options.is_empty(), storage_options_provider) { + (false, Some(provider)) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_initial_and_provider( + storage_options, + provider, + ), + )), + (false, None) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), + (true, Some(provider)) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_provider(provider), + )), + (true, None) => None, + }; + + let store_params = ObjectStoreParams { block_size: block_size.map(|size| size as usize), - storage_options: Some(storage_options.clone()), + storage_options_accessor: accessor, ..Default::default() }; - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - store_params.s3_credentials_refresh_offset = - std::time::Duration::from_secs(offset_seconds); - } - if let Some(provider) = storage_options_provider.clone() { - store_params.storage_options_provider = Some(provider); - } let params = ReadParams { index_cache_size_bytes: index_cache_size_bytes as usize, metadata_cache_size_bytes: metadata_cache_size_bytes as usize, @@ -126,14 +151,6 @@ impl BlockingDataset { if let Some(ver) = version { builder = builder.with_version(ver as u64); } - builder = builder.with_storage_options(storage_options); - if let Some(provider) = storage_options_provider.clone() { - builder = builder.with_storage_options_provider(provider) - } - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - builder = builder - .with_s3_credentials_refresh_offset(std::time::Duration::from_secs(offset_seconds)); - } if let Some(serialized_manifest) = serialized_manifest { builder = builder.with_serialized_manifest(serialized_manifest)?; @@ -149,12 +166,19 @@ impl BlockingDataset { read_version: Option<u64>, storage_options: HashMap<String, String>, ) -> Result<Self> { + let accessor = if storage_options.is_empty() { + None + } else { + Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )) + }; let inner = RT.block_on(Dataset::commit( uri, operation, read_version, Some(ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: accessor, ..Default::default() }), None, @@ -336,7 +360,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiSchema<'local>( enable_stable_row_ids: JObject, // Optional data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw!( env, @@ -351,7 +374,6 @@ pub extern "system" fn
Java_org_lance_Dataset_createWithFfiSchema<'local>( enable_stable_row_ids, data_storage_version, storage_options_obj, - s3_credentials_refresh_offset_seconds_obj ) ) } @@ -368,7 +390,6 @@ fn inner_create_with_ffi_schema<'local>( enable_stable_row_ids: JObject, // Optional data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result<JObject<'local>> { let c_schema_ptr = arrow_schema_addr as *mut FFI_ArrowSchema; let c_schema = unsafe { FFI_ArrowSchema::from_raw(c_schema_ptr) }; @@ -386,7 +407,6 @@ fn inner_create_with_ffi_schema<'local>( data_storage_version, storage_options_obj, JObject::null(), // No provider for schema-only creation - s3_credentials_refresh_offset_seconds_obj, reader, ) } @@ -418,7 +438,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiStream<'local>( enable_stable_row_ids: JObject, // Optional data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw!( env, @@ -434,7 +453,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiStream<'local>( data_storage_version, storage_options_obj, JObject::null(), - s3_credentials_refresh_offset_seconds_obj ) ) } @@ -453,7 +471,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiStreamAndProvider<'lo data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw!( env, @@ -469,7 +486,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiStreamAndProvider<'lo data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj ) ) } @@ -487,7 +503,6 @@ fn inner_create_with_ffi_stream<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result<JObject<'local>> { let stream_ptr = arrow_array_stream_addr as *mut FFI_ArrowArrayStream; let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?; @@ -502,7 +517,6 @@ fn inner_create_with_ffi_stream<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj, reader, ) } @@ -519,7 +533,6 @@ fn create_dataset<'local>( data_storage_version: JObject, storage_options_obj: JObject, storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, reader: impl RecordBatchReader + Send + 'static, ) -> Result<JObject<'local>> { let path_str = path.extract(env)?; @@ -534,7 +547,6 @@ fn create_dataset<'local>( &data_storage_version, &storage_options_obj, &storage_options_provider_obj, - &s3_credentials_refresh_offset_seconds_obj, )?; let dataset = BlockingDataset::write(reader, &path_str, Some(write_params))?;
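For orientation, a sketch of the Java side these natives now expect: credential refresh is owned entirely by the provider, so there is no refresh-offset argument to thread through. Everything here beyond the `StorageOptionsProvider` type name is an assumption for illustration; in particular the `fetchStorageOptions` method name and the `CredentialVendor` helper are not confirmed by this diff.

```java
import java.util.HashMap;
import java.util.Map;

// Hypothetical credential source (e.g. an STS-backed vendor).
interface CredentialVendor {
    String accessKeyId();
    String secretAccessKey();
    String sessionToken();
}

// Hypothetical provider implementation: returns a fresh set of storage
// options each time the native layer asks for them; the native side now
// decides when a refresh is needed.
class VendedCredentialsProvider implements StorageOptionsProvider {
    private final CredentialVendor vendor;

    VendedCredentialsProvider(CredentialVendor vendor) {
        this.vendor = vendor;
    }

    @Override
    public Map<String, String> fetchStorageOptions() { // assumed method name
        Map<String, String> options = new HashMap<>();
        options.put("aws_access_key_id", vendor.accessKeyId());
        options.put("aws_secret_access_key", vendor.secretAccessKey());
        options.put("aws_session_token", vendor.sessionToken());
        return options;
    }
}
```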
@@ -929,7 +941,6 @@ pub extern "system" fn Java_org_lance_Dataset_openNative<'local>( storage_options_obj: JObject, // Map serialized_manifest: JObject, // Optional storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj ) ) } @@ -959,7 +969,6 @@ fn inner_open_native<'local>( storage_options_obj: JObject, // Map serialized_manifest: JObject, // Optional storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result<JObject<'local>> { let path_str: String = path.extract(env)?; let version = env.get_int_opt(&version_obj)?; @@ -994,35 +1003,6 @@ fn inner_open_native<'local>( let storage_options_provider_arc = storage_options_provider.map(|v| Arc::new(v) as Arc<dyn StorageOptionsProvider>); - // Extract s3_credentials_refresh_offset_seconds - let s3_credentials_refresh_offset_seconds = - if !s3_credentials_refresh_offset_seconds_obj.is_null() { - let is_present = env - .call_method( - &s3_credentials_refresh_offset_seconds_obj, - "isPresent", - "()Z", - &[], - )? - .z()?; - if is_present { - let value = env - .call_method( - &s3_credentials_refresh_offset_seconds_obj, - "get", - "()Ljava/lang/Object;", - &[], - )? - .l()?; - let long_value = env.call_method(&value, "longValue", "()J", &[])?.j()?; - Some(long_value as u64) - } else { - None - } - } else { - None - }; - let serialized_manifest = env.get_bytes_opt(&serialized_manifest)?; let dataset = BlockingDataset::open( &path_str, @@ -1033,7 +1013,6 @@ fn inner_open_native<'local>( storage_options, serialized_manifest, storage_options_provider_arc, - s3_credentials_refresh_offset_seconds, )?; dataset.into_java(env) } @@ -1229,6 +1208,58 @@ fn inner_latest_version_id(env: &mut JNIEnv, java_dataset: JObject) -> Result<jlong> +#[no_mangle] +pub extern "system" fn Java_org_lance_Dataset_nativeGetInitialStorageOptions<'local>( + mut env: JNIEnv<'local>, + java_dataset: JObject, +) -> JObject<'local> { + ok_or_throw!( + env, + inner_get_initial_storage_options(&mut env, java_dataset) + ) +} + +fn inner_get_initial_storage_options<'local>( + env: &mut JNIEnv<'local>, + java_dataset: JObject, +) -> Result<JObject<'local>> { + let storage_options = { + let dataset_guard = + unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; + dataset_guard.initial_storage_options() + }; + match storage_options { + Some(opts) => opts.into_java(env), + None => Ok(JObject::null()), + } +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_Dataset_nativeGetLatestStorageOptions<'local>( + mut env: JNIEnv<'local>, + java_dataset: JObject, +) -> JObject<'local> { + ok_or_throw!( + env, + inner_get_latest_storage_options(&mut env, java_dataset) + ) +} + +fn inner_get_latest_storage_options<'local>( + env: &mut JNIEnv<'local>, + java_dataset: JObject, +) -> Result<JObject<'local>> { + let storage_options = { + let dataset_guard = + unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; + dataset_guard.latest_storage_options()? + }; + match storage_options { + Some(opts) => opts.into_java(env), + None => Ok(JObject::null()), + } +}
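A sketch of how the two new natives might surface in the Java API. Only the native entry points `nativeGetInitialStorageOptions` / `nativeGetLatestStorageOptions` appear in this diff, so the public wrapper names below are assumptions that mirror them.

```java
import java.util.Map;

class CredentialRotationCheck {
    // Hedged sketch: detect whether the provider has rotated credentials
    // since the dataset was opened, by comparing the options captured at
    // open time with the current (possibly refreshed) options.
    static boolean credentialsRotated(Dataset dataset) {
        Map<String, String> initial = dataset.getInitialStorageOptions(); // assumed wrapper
        Map<String, String> latest = dataset.getLatestStorageOptions();   // assumed wrapper
        if (initial == null || latest == null) {
            return false; // no storage options were configured at open time
        }
        return !initial.equals(latest);
    }
}
```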
#[no_mangle] pub extern "system" fn Java_org_lance_Dataset_nativeCheckoutLatest( mut env: JNIEnv, @@ -1358,20 +1389,16 @@ fn inner_shallow_clone<'local>( let new_ds = { let mut dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; - RT.block_on( - dataset_guard.inner.shallow_clone( - &target_path_str, - reference, - storage_options - .map(|options| { - Some(ObjectStoreParams { - storage_options: Some(options), - ..Default::default() - }) - }) - .unwrap_or(None), - ), - )? + RT.block_on(dataset_guard.inner.shallow_clone( + &target_path_str, + reference, + storage_options.map(|options| ObjectStoreParams { + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(options), + )), + ..Default::default() + }), + ))? }; BlockingDataset { inner: new_ds }.into_java(env)
diff --git a/java/lance-jni/src/error.rs b/java/lance-jni/src/error.rs index 4e8f988120d..ef05b8cdb5c 100644 --- a/java/lance-jni/src/error.rs +++ b/java/lance-jni/src/error.rs @@ -6,6 +6,7 @@ use std::str::Utf8Error; use arrow_schema::ArrowError; use jni::{errors::Error as JniError, JNIEnv}; use lance::Error as LanceError; +use lance_namespace::error::NamespaceError; use serde_json::Error as JsonError; #[derive(Debug, PartialEq, Eq)] @@ -15,6 +16,7 @@ pub enum JavaExceptionClass { RuntimeException, UnsupportedOperationException, AlreadyInException, + LanceNamespaceException, } impl JavaExceptionClass { @@ -26,6 +28,7 @@ impl JavaExceptionClass { Self::UnsupportedOperationException => "java/lang/UnsupportedOperationException", // Included for display purposes. This is not a real exception. Self::AlreadyInException => "AlreadyInException", + Self::LanceNamespaceException => "org/lance/namespace/errors/LanceNamespaceException", } } } @@ -34,6 +37,7 @@ impl JavaExceptionClass { pub struct Error { message: String, java_class: JavaExceptionClass, + namespace_error_code: Option<u32>, } impl Error { @@ -41,6 +45,7 @@ impl Error { Self { message, java_class, + namespace_error_code: None, } } @@ -48,6 +53,7 @@ impl Error { Self { message, java_class: JavaExceptionClass::RuntimeException, + namespace_error_code: None, } } @@ -63,10 +69,19 @@ impl Error { Self::new(message, JavaExceptionClass::UnsupportedOperationException) } + pub fn namespace_error(code: u32, message: String) -> Self { + Self { + message, + java_class: JavaExceptionClass::LanceNamespaceException, + namespace_error_code: Some(code), + } + } + pub fn in_exception() -> Self { Self { message: String::default(), java_class: JavaExceptionClass::AlreadyInException, + namespace_error_code: None, } } @@ -75,11 +90,105 @@ impl Error { // An exception is already in progress, so we don't need to throw another one. return; } + + // For namespace errors, throw the specific LanceNamespaceException + if self.java_class == JavaExceptionClass::LanceNamespaceException { + if let Some(code) = self.namespace_error_code { + // Build the ErrorCode via ErrorCode.fromCode and throw LanceNamespaceException + if self.throw_namespace_exception(env, code).is_err() { + // lance-namespace is bundled as a dependency, so the exception classes + // should always be available. Panic if they're not. + panic!( + "Failed to throw LanceNamespaceException (code={}).
\ + org.lance.namespace.errors.LanceNamespaceException and ErrorCode classes \ + must be available in the classpath.", + code + ) + } + return; + } + } + if let Err(e) = env.throw_new(self.java_class.as_str(), &self.message) { eprintln!("Error when throwing Java exception: {:?}", e.to_string()); panic!("Error when throwing Java exception: {:?}", e); } } + + fn throw_namespace_exception( + &self, + env: &mut JNIEnv, + code: u32, + ) -> std::result::Result<(), ()> { + // Try to find and call the LanceNamespaceException constructor + // that takes ErrorCode and message + let class_name = "org/lance/namespace/errors/LanceNamespaceException"; + let error_code_class = "org/lance/namespace/errors/ErrorCode"; + + // Find the ErrorCode.fromCode method + let error_code_cls = env.find_class(error_code_class).map_err(|_| ())?; + let from_code_method = env + .get_static_method_id( + &error_code_cls, + "fromCode", + "(I)Lorg/lance/namespace/errors/ErrorCode;", + ) + .map_err(|_| ())?; + let error_code_obj = unsafe { + env.call_static_method_unchecked( + &error_code_cls, + from_code_method, + jni::signature::ReturnType::Object, + &[jni::sys::jvalue { + i: code as jni::sys::jint, + }], + ) + } + .map_err(|_| ())?; + + let error_code = match error_code_obj { + jni::objects::JValueGen::Object(obj) => obj, + _ => return Err(()), + }; + + // Find the LanceNamespaceException class + let exception_cls = env.find_class(class_name).map_err(|_| ())?; + + // Create message JString + let message_str = env.new_string(&self.message).map_err(|_| ())?; + + // Find constructor (ErrorCode, String) + let constructor = env + .get_method_id( + &exception_cls, + "<init>", + "(Lorg/lance/namespace/errors/ErrorCode;Ljava/lang/String;)V", + ) + .map_err(|_| ())?; + + // Create the exception object + let exception_obj = unsafe { + env.new_object_unchecked( + &exception_cls, + constructor, + &[ + jni::sys::jvalue { + l: error_code.as_raw(), + }, + jni::sys::jvalue { + l: message_str.as_raw(), + }, + ], + ) + } + .map_err(|_| ())?; + + // Throw the exception + env.throw(jni::objects::JThrowable::from(exception_obj)) + .map_err(|_| ())?; + + Ok(()) + } } pub type Result<T> = std::result::Result<T, Error>; impl std::fmt::Display for Error { @@ -92,7 +201,7 @@ impl From<LanceError> for Error { fn from(err: LanceError) -> Self { - match err { + match &err { LanceError::DatasetNotFound { .. } | LanceError::DatasetAlreadyExists { .. } | LanceError::CommitConflict { .. } @@ -100,6 +209,19 @@ impl From<LanceError> for Error { LanceError::IO { .. } => Self::io_error(err.to_string()), LanceError::NotSupported { .. } => Self::unsupported_error(err.to_string()), LanceError::NotFound { .. } => Self::io_error(err.to_string()), + LanceError::Namespace { source, .. } => { + // Try to downcast to NamespaceError and get the error code + if let Some(ns_err) = source.downcast_ref::<NamespaceError>() { + Self::namespace_error(ns_err.code().as_u32(), ns_err.to_string()) + } else { + log::warn!( + "Failed to downcast NamespaceError source, falling back to runtime error. \ + This may indicate a version mismatch.
Source type: {:?}", + source + ); + Self::runtime_error(err.to_string()) + } + } _ => Self::runtime_error(err.to_string()), } } diff --git a/java/lance-jni/src/file_reader.rs b/java/lance-jni/src/file_reader.rs index 11591b3acea..ccaac121579 100644 --- a/java/lance-jni/src/file_reader.rs +++ b/java/lance-jni/src/file_reader.rs @@ -112,7 +112,9 @@ fn inner_open<'local>( let storage_options = to_rust_map(env, &jmap)?; let reader = RT.block_on(async move { let object_params = ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), ..Default::default() }; let (obj_store, path) = ObjectStore::from_uri_and_params( diff --git a/java/lance-jni/src/file_writer.rs b/java/lance-jni/src/file_writer.rs index 600d7de2845..ebc5b1c328b 100644 --- a/java/lance-jni/src/file_writer.rs +++ b/java/lance-jni/src/file_writer.rs @@ -94,7 +94,9 @@ fn inner_open<'local>( let writer = RT.block_on(async move { let object_params = ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), ..Default::default() }; let (obj_store, path) = ObjectStore::from_uri_and_params( diff --git a/java/lance-jni/src/fragment.rs b/java/lance-jni/src/fragment.rs index 775ad0d906d..72377413c26 100644 --- a/java/lance-jni/src/fragment.rs +++ b/java/lance-jni/src/fragment.rs @@ -91,7 +91,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw_with_return!( env, @@ -108,7 +107,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj ), JObject::default() ) @@ -128,7 +126,6 @@ fn inner_create_with_ffi_array<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let c_array_ptr = arrow_array_addr as *mut FFI_ArrowArray; let c_schema_ptr = arrow_schema_addr as *mut FFI_ArrowSchema; @@ -154,7 +151,6 @@ fn inner_create_with_ffi_array<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj, reader, ) } @@ -173,7 +169,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'a> { ok_or_throw_with_return!( env, @@ -189,7 +184,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj ), JObject::null() ) @@ -208,7 +202,6 @@ fn inner_create_with_ffi_stream<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let stream_ptr = arrow_array_stream_addr as 
diff --git a/java/lance-jni/src/file_reader.rs b/java/lance-jni/src/file_reader.rs index 11591b3acea..ccaac121579 100644 --- a/java/lance-jni/src/file_reader.rs +++ b/java/lance-jni/src/file_reader.rs @@ -112,7 +112,9 @@ fn inner_open<'local>( let storage_options = to_rust_map(env, &jmap)?; let reader = RT.block_on(async move { let object_params = ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), ..Default::default() }; let (obj_store, path) = ObjectStore::from_uri_and_params(
diff --git a/java/lance-jni/src/file_writer.rs b/java/lance-jni/src/file_writer.rs index 600d7de2845..ebc5b1c328b 100644 --- a/java/lance-jni/src/file_writer.rs +++ b/java/lance-jni/src/file_writer.rs @@ -94,7 +94,9 @@ fn inner_open<'local>( let writer = RT.block_on(async move { let object_params = ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), ..Default::default() }; let (obj_store, path) = ObjectStore::from_uri_and_params(
diff --git a/java/lance-jni/src/fragment.rs b/java/lance-jni/src/fragment.rs index 775ad0d906d..72377413c26 100644 --- a/java/lance-jni/src/fragment.rs +++ b/java/lance-jni/src/fragment.rs @@ -91,7 +91,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw_with_return!( env, @@ -108,7 +107,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj ), JObject::default() ) @@ -128,7 +126,6 @@ fn inner_create_with_ffi_array<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result<JObject<'local>> { let c_array_ptr = arrow_array_addr as *mut FFI_ArrowArray; let c_schema_ptr = arrow_schema_addr as *mut FFI_ArrowSchema; @@ -154,7 +151,6 @@ fn inner_create_with_ffi_array<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj, reader, ) } @@ -173,7 +169,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'a> { ok_or_throw_with_return!( env, @@ -189,7 +184,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj ), JObject::null() ) @@ -208,7 +202,6 @@ fn inner_create_with_ffi_stream<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result<JObject<'local>> { let stream_ptr = arrow_array_stream_addr as *mut FFI_ArrowArrayStream; let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?; @@ -224,7 +217,6 @@ fn inner_create_with_ffi_stream<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj, reader, ) } @@ -241,7 +233,6 @@ fn create_fragment<'a>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional source: impl StreamingWriteSource, ) -> Result<JObject<'a>> { let path_str = dataset_uri.extract(env)?; @@ -256,7 +247,6 @@ fn create_fragment<'a>( &data_storage_version, &storage_options_obj, &storage_options_provider_obj, - &s3_credentials_refresh_offset_seconds_obj, )?; let fragments = RT.block_on(FileFragment::create_fragments(
diff --git a/java/lance-jni/src/namespace.rs b/java/lance-jni/src/namespace.rs index d197c2b594b..b9db171c064 100644 --- a/java/lance-jni/src/namespace.rs +++ b/java/lance-jni/src/namespace.rs @@ -1,23 +1,121 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +use std::collections::HashMap; +use std::sync::Arc; + use bytes::Bytes; -use jni::objects::{JByteArray, JMap, JObject, JString}; +use jni::objects::{GlobalRef, JByteArray, JMap, JObject, JString, JValue}; use jni::sys::{jbyteArray, jlong, jstring}; use jni::JNIEnv; use lance_namespace::models::*; use lance_namespace::LanceNamespace as LanceNamespaceTrait; use lance_namespace_impls::{ - ConnectBuilder, DirectoryNamespace, DirectoryNamespaceBuilder, RestAdapter, RestAdapterConfig, - RestNamespace, RestNamespaceBuilder, + ConnectBuilder, DirectoryNamespace, DirectoryNamespaceBuilder, DynamicContextProvider, + OperationInfo, RestAdapter, RestAdapterConfig, RestNamespace, RestNamespaceBuilder, }; use serde::{Deserialize, Serialize}; -use std::sync::Arc; use crate::error::{Error, Result}; use crate::utils::to_rust_map; use crate::RT; +/// Java-implemented dynamic context provider. +/// +/// Wraps a Java object that implements the DynamicContextProvider interface. +pub struct JavaDynamicContextProvider { + java_provider: GlobalRef, + jvm: Arc<JavaVM>, +} + +impl JavaDynamicContextProvider { + /// Create a new Java context provider wrapper.
+ pub fn new(env: &mut JNIEnv, java_provider: &JObject) -> Result<Self> { + let java_provider = env.new_global_ref(java_provider)?; + let jvm = Arc::new(env.get_java_vm()?); + Ok(Self { java_provider, jvm }) + } +} + +impl std::fmt::Debug for JavaDynamicContextProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "JavaDynamicContextProvider") + } +} + +impl DynamicContextProvider for JavaDynamicContextProvider { + fn provide_context(&self, info: &OperationInfo) -> HashMap<String, String> { + // Attach to JVM + let mut env = match self.jvm.attach_current_thread() { + Ok(env) => env, + Err(e) => { + log::error!("Failed to attach to JVM: {}", e); + return HashMap::new(); + } + }; + + // Create Java strings for parameters + let operation = match env.new_string(&info.operation) { + Ok(s) => s, + Err(e) => { + log::error!("Failed to create operation string: {}", e); + return HashMap::new(); + } + }; + + let object_id = match env.new_string(&info.object_id) { + Ok(s) => s, + Err(e) => { + log::error!("Failed to create object_id string: {}", e); + return HashMap::new(); + } + }; + + // Call provideContext(String, String) -> Map<String, String> + let result = env.call_method( + &self.java_provider, + "provideContext", + "(Ljava/lang/String;Ljava/lang/String;)Ljava/util/Map;", + &[JValue::Object(&operation), JValue::Object(&object_id)], + ); + + match result { + Ok(jvalue) => match jvalue.l() { + Ok(obj) if !obj.is_null() => { + // Convert Java Map to Rust HashMap + convert_java_map_to_hashmap(&mut env, &obj).unwrap_or_default() + } + Ok(_) => HashMap::new(), + Err(e) => { + log::error!("provideContext did not return object: {}", e); + HashMap::new() + } + }, + Err(e) => { + log::error!("Failed to call provideContext: {}", e); + HashMap::new() + } + } + } +} + +fn convert_java_map_to_hashmap( + env: &mut JNIEnv, + map_obj: &JObject, +) -> Result<HashMap<String, String>> { + let jmap = JMap::from_env(env, map_obj)?; + let mut result = HashMap::new(); + + let mut iter = jmap.iter(env)?; + while let Some((key, value)) = iter.next(env)? { + let key_str: String = env.get_string(&JString::from(key))?.into(); + let value_str: String = env.get_string(&JString::from(value))?.into(); + result.insert(key_str, value_str); + } + + Ok(result) +}
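The call above pins down the Java contract: a `provideContext(String, String)` method returning a `java.util.Map`. A sketch of the corresponding interface, with naming assumed from the JNI signature rather than taken from a visible Java source:

```java
import java.util.Map;

// Functional interface the JNI wrapper above invokes on every operation.
public interface DynamicContextProvider {
    /**
     * @param operation the namespace operation being executed
     * @param objectId identifier of the namespace or table being acted on
     * @return extra context (e.g. per-request headers) to attach; empty map for none
     */
    Map<String, String> provideContext(String operation, String objectId);
}
```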
/// Blocking wrapper for DirectoryNamespace pub struct BlockingDirectoryNamespace { pub(crate) inner: DirectoryNamespace, } @@ -40,20 +138,47 @@ pub extern "system" fn Java_org_lance_namespace_DirectoryNamespace_createNative( ) -> jlong { ok_or_throw_with_return!( env, - create_directory_namespace_internal(&mut env, properties_map), + create_directory_namespace_internal(&mut env, properties_map, None), + 0 + ) +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_DirectoryNamespace_createNativeWithProvider( + mut env: JNIEnv, + _obj: JObject, + properties_map: JObject, + context_provider: JObject, +) -> jlong { + ok_or_throw_with_return!( + env, + create_directory_namespace_internal(&mut env, properties_map, Some(context_provider)), 0 ) } -fn create_directory_namespace_internal(env: &mut JNIEnv, properties_map: JObject) -> Result<jlong> { +fn create_directory_namespace_internal( + env: &mut JNIEnv, + properties_map: JObject, + context_provider: Option<JObject>, +) -> Result<jlong> { // Convert Java HashMap to Rust HashMap let jmap = JMap::from_env(env, &properties_map)?; let properties = to_rust_map(env, &jmap)?; // Build DirectoryNamespace using builder - let builder = DirectoryNamespaceBuilder::from_properties(properties, None).map_err(|e| { - Error::runtime_error(format!("Failed to create DirectoryNamespaceBuilder: {}", e)) - })?; + let mut builder = + DirectoryNamespaceBuilder::from_properties(properties, None).map_err(|e| { + Error::runtime_error(format!("Failed to create DirectoryNamespaceBuilder: {}", e)) + })?; + + // Add context provider if provided + if let Some(provider_obj) = context_provider { + if !provider_obj.is_null() { + let java_provider = JavaDynamicContextProvider::new(env, &provider_obj)?; + builder = builder.context_provider(Arc::new(java_provider)); + } + } let namespace = RT .block_on(builder.build())
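A hedged sketch of wiring a context provider in from Java. Only the `createNativeWithProvider` entry point is visible in this diff, so the public factory below is an assumed convenience wrapper over it:

```java
import java.util.HashMap;
import java.util.Map;

class NamespaceWiring {
    // Sketch: attach a per-operation trace id to every namespace call.
    static DirectoryNamespace openWithTracing() {
        Map<String, String> properties = new HashMap<>();
        properties.put("root", "/data/lance"); // illustrative property key

        DynamicContextProvider provider = (operation, objectId) ->
                Map.of("x-trace-id", java.util.UUID.randomUUID().toString());

        // Assumed wrapper over createNativeWithProvider(properties, provider).
        return DirectoryNamespace.create(properties, provider);
    }
}
```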
@@ -313,6 +438,7 @@ pub extern "system" fn Java_org_lance_namespace_DirectoryNamespace_createTableNa } #[no_mangle] +#[allow(deprecated)] pub extern "system" fn Java_org_lance_namespace_DirectoryNamespace_createEmptyTableNative( mut env: JNIEnv, _obj: JObject, @@ -329,6 +455,23 @@ pub extern "system" fn Java_org_lance_namespace_DirectoryNamespace_createEmptyTa .into_raw() } +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_DirectoryNamespace_declareTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.declare_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + #[no_mangle] pub extern "system" fn Java_org_lance_namespace_DirectoryNamespace_insertIntoTableNative( mut env: JNIEnv, @@ -519,21 +662,47 @@ pub extern "system" fn Java_org_lance_namespace_RestNamespace_createNative( ) -> jlong { ok_or_throw_with_return!( env, - create_rest_namespace_internal(&mut env, properties_map), + create_rest_namespace_internal(&mut env, properties_map, None), + 0 + ) +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_RestNamespace_createNativeWithProvider( + mut env: JNIEnv, + _obj: JObject, + properties_map: JObject, + context_provider: JObject, +) -> jlong { + ok_or_throw_with_return!( + env, + create_rest_namespace_internal(&mut env, properties_map, Some(context_provider)), 0 ) } -fn create_rest_namespace_internal(env: &mut JNIEnv, properties_map: JObject) -> Result<jlong> { +fn create_rest_namespace_internal( + env: &mut JNIEnv, + properties_map: JObject, + context_provider: Option<JObject>, +) -> Result<jlong> { // Convert Java HashMap to Rust HashMap let jmap = JMap::from_env(env, &properties_map)?; let properties = to_rust_map(env, &jmap)?; // Build RestNamespace using builder - let builder = RestNamespaceBuilder::from_properties(properties).map_err(|e| { + let mut builder = RestNamespaceBuilder::from_properties(properties).map_err(|e| { Error::runtime_error(format!("Failed to create RestNamespaceBuilder: {}", e)) })?; + // Add context provider if provided + if let Some(provider_obj) = context_provider { + if !provider_obj.is_null() { + let java_provider = JavaDynamicContextProvider::new(env, &provider_obj)?; + builder = builder.context_provider(Arc::new(java_provider)); + } + } + let namespace = builder.build(); let blocking_namespace = BlockingRestNamespace { inner: namespace }; @@ -790,6 +959,7 @@ pub extern "system" fn Java_org_lance_namespace_RestNamespace_createTableNative( } #[no_mangle] +#[allow(deprecated)] pub extern "system" fn Java_org_lance_namespace_RestNamespace_createEmptyTableNative( mut env: JNIEnv, _obj: JObject, @@ -806,6 +976,23 @@ pub extern "system" fn Java_org_lance_namespace_RestNamespace_createEmptyTableNa .into_raw() } +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_RestNamespace_declareTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_rest_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.declare_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + #[no_mangle] pub extern "system" fn Java_org_lance_namespace_RestNamespace_insertIntoTableNative( mut env: JNIEnv,
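Both namespace flavors now expose `declareTable`; the natives exchange it as JSON strings. A sketch assuming the typed wrappers from lance-namespace-core sit on top — the wrapper method, setter, and getter names below are assumptions:

```java
import java.util.Arrays;

class DeclareTableExample {
    // Hedged sketch: declare a table location without writing data yet.
    static String declare(DirectoryNamespace namespace) {
        DeclareTableRequest request = new DeclareTableRequest();
        request.setId(Arrays.asList("my_namespace", "my_table")); // assumed setter

        DeclareTableResponse response = namespace.declareTable(request);
        return response.getLocation(); // assumed getter
    }
}
```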
.l()?; let write_param_jmap = JMap::from_env(env, &write_param_jobj)?; - let mut write_param = to_rust_map(env, &write_param_jmap)?; + let write_param = to_rust_map(env, &write_param_jmap)?; - // Extract s3_credentials_refresh_offset_seconds from write_param - let s3_credentials_refresh_offset = write_param - .remove("s3_credentials_refresh_offset_seconds") - .and_then(|v| v.parse::().ok()) - .map(std::time::Duration::from_secs) - .unwrap_or_else(|| std::time::Duration::from_secs(10)); - - // Get the Dataset's storage_options_provider - let storage_options_provider = { + // Get the Dataset's storage_options_accessor and merge with write_param + let storage_options_accessor = { let dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(&java_dataset, NATIVE_DATASET) }?; - dataset_guard.get_storage_options_provider() + let existing_accessor = dataset_guard.inner.storage_options_accessor(); + + // Merge write_param with existing accessor's initial options + match existing_accessor { + Some(accessor) => { + let mut merged = accessor + .initial_storage_options() + .cloned() + .unwrap_or_default(); + merged.extend(write_param); + if let Some(provider) = accessor.provider().cloned() { + Some(Arc::new( + lance::io::StorageOptionsAccessor::with_initial_and_provider( + merged, provider, + ), + )) + } else { + Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(merged), + )) + } + } + None => { + if !write_param.is_empty() { + Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(write_param), + )) + } else { + None + } + } + } }; - // Build ObjectStoreParams using write_param for storage_options and provider from Dataset + // Build ObjectStoreParams using the merged accessor let store_params = ObjectStoreParams { - storage_options: Some(write_param), - storage_options_provider, - s3_credentials_refresh_offset, + storage_options_accessor, ..Default::default() }; diff --git a/java/lance-jni/src/utils.rs b/java/lance-jni/src/utils.rs index dc6f1e6e60f..8a266791f10 100644 --- a/java/lance-jni/src/utils.rs +++ b/java/lance-jni/src/utils.rs @@ -48,7 +48,6 @@ pub fn extract_write_params( data_storage_version: &JObject, storage_options_obj: &JObject, storage_options_provider_obj: &JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: &JObject, // Optional ) -> Result { let mut write_params = WriteParams::default(); @@ -76,26 +75,31 @@ pub fn extract_write_params( extract_storage_options(env, storage_options_obj)?; // Extract storage options provider if present - let storage_options_provider = env.get_optional(storage_options_provider_obj, |env, obj| { - let provider_obj = env - .call_method(obj, "get", "()Ljava/lang/Object;", &[])? - .l()?; - JavaStorageOptionsProvider::new(env, provider_obj) - })?; - - let storage_options_provider_arc: Option> = - storage_options_provider.map(|v| Arc::new(v) as Arc); - - // Extract s3_credentials_refresh_offset_seconds if present - let s3_credentials_refresh_offset = env - .get_long_opt(s3_credentials_refresh_offset_seconds_obj)? - .map(|v| std::time::Duration::from_secs(v as u64)) - .unwrap_or_else(|| std::time::Duration::from_secs(10)); + let storage_options_provider: Option> = env + .get_optional(storage_options_provider_obj, |env, optional_obj| { + let provider_obj = env + .call_method(optional_obj, "get", "()Ljava/lang/Object;", &[])? + .l()?; + JavaStorageOptionsProvider::new(env, provider_obj) + })? 
+ .map(|p| Arc::new(p) as Arc); + + // Create storage options accessor from storage_options and provider + let accessor = match (storage_options.is_empty(), storage_options_provider) { + (false, Some(provider)) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_initial_and_provider(storage_options, provider), + )), + (false, None) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), + (true, Some(provider)) => Some(Arc::new(lance::io::StorageOptionsAccessor::with_provider( + provider, + ))), + (true, None) => None, + }; write_params.store_params = Some(ObjectStoreParams { - storage_options: Some(storage_options), - storage_options_provider: storage_options_provider_arc, - s3_credentials_refresh_offset, + storage_options_accessor: accessor, ..Default::default() }); Ok(write_params) diff --git a/java/pom.xml b/java/pom.xml index ad757fa8a10..20fa8a767b1 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -108,12 +108,12 @@ org.lance lance-namespace-core - 0.2.1 + 0.4.5 org.lance lance-namespace-apache-client - 0.2.1 + 0.4.5 com.fasterxml.jackson.core diff --git a/java/src/main/java/org/lance/Dataset.java b/java/src/main/java/org/lance/Dataset.java index 21572214eda..0249732eb00 100644 --- a/java/src/main/java/org/lance/Dataset.java +++ b/java/src/main/java/org/lance/Dataset.java @@ -139,8 +139,7 @@ public static Dataset create( params.getMode(), params.getEnableStableRowIds(), params.getDataStorageVersion(), - params.getStorageOptions(), - params.getS3CredentialsRefreshOffsetSeconds()); + params.getStorageOptions()); dataset.allocator = allocator; return dataset; } @@ -198,8 +197,7 @@ static Dataset create( params.getEnableStableRowIds(), params.getDataStorageVersion(), params.getStorageOptions(), - Optional.ofNullable(storageOptionsProvider), - params.getS3CredentialsRefreshOffsetSeconds()); + Optional.ofNullable(storageOptionsProvider)); dataset.allocator = allocator; return dataset; } @@ -213,8 +211,7 @@ private static native Dataset createWithFfiSchema( Optional mode, Optional enableStableRowIds, Optional dataStorageVersion, - Map storageOptions, - Optional s3CredentialsRefreshOffsetSeconds); + Map storageOptions); private static native Dataset createWithFfiStream( long arrowStreamMemoryAddress, @@ -225,8 +222,7 @@ private static native Dataset createWithFfiStream( Optional mode, Optional enableStableRowIds, Optional dataStorageVersion, - Map storageOptions, - Optional s3CredentialsRefreshOffsetSeconds); + Map storageOptions); private static native Dataset createWithFfiStreamAndProvider( long arrowStreamMemoryAddress, @@ -238,8 +234,7 @@ private static native Dataset createWithFfiStreamAndProvider( Optional enableStableRowIds, Optional dataStorageVersion, Map storageOptions, - Optional storageOptionsProvider, - Optional s3CredentialsRefreshOffsetSeconds); + Optional storageOptionsProvider); /** * Open a dataset from the specified path. 
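With the offset parameter gone from every native signature, refresh timing on the read path is carried entirely by the credential options themselves (the expires_at_millis / refresh_offset_millis keys exercised in the tests further down), and the new getInitialStorageOptions()/getLatestStorageOptions() accessors added just below expose the resulting state. A minimal sketch of the call site after this change, assuming StorageOptionsProvider lives in org.lance and that OpenDatasetBuilder exposes allocator() and execute() like its write counterpart (neither appears in this hunk):

    import java.util.Arrays;
    import java.util.Map;
    import org.apache.arrow.memory.BufferAllocator;
    import org.lance.Dataset;
    import org.lance.ReadOptions;
    import org.lance.StorageOptionsProvider; // package assumed, not shown in this diff
    import org.lance.namespace.LanceNamespace;

    class OpenWithProviderSketch {
      static void openAndInspect(
          BufferAllocator allocator, LanceNamespace namespace, StorageOptionsProvider provider) {
        // No setS3CredentialsRefreshOffsetSeconds any more: the provider decides when to hand
        // out fresh credentials, guided by the options attached to the credentials themselves.
        ReadOptions options =
            new ReadOptions.Builder().setStorageOptionsProvider(provider).build();
        try (Dataset ds =
            Dataset.open()
                .allocator(allocator) // assumed builder method, by analogy with Dataset.write()
                .namespace(namespace)
                .tableId(Arrays.asList("my_table"))
                .readOptions(options)
                .execute()) { // assumed terminal method, as on WriteDatasetBuilder
          Map<String, String> initial = ds.getInitialStorageOptions(); // as passed at open time
          Map<String, String> latest = ds.getLatestStorageOptions();  // may refresh via provider
        }
      }
    }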
@@ -317,8 +312,7 @@ static Dataset open( options.getMetadataCacheSizeBytes(), options.getStorageOptions(), options.getSerializedManifest(), - options.getStorageOptionsProvider(), - options.getS3CredentialsRefreshOffsetSeconds()); + options.getStorageOptionsProvider()); dataset.allocator = allocator; dataset.selfManagedAllocator = selfManagedAllocator; return dataset; @@ -332,8 +326,7 @@ private static native Dataset openNative( long metadataCacheSizeBytes, Map storageOptions, Optional serializedManifest, - Optional storageOptionsProvider, - Optional s3CredentialsRefreshOffsetSeconds); + Optional storageOptionsProvider); /** * Creates a builder for opening a dataset. @@ -686,6 +679,42 @@ public long latestVersion() { private native long nativeGetLatestVersionId(); + /** + * Get the initial storage options used to open this dataset. + * + *
+ * <p>
This returns the options that were provided when the dataset was opened, without any refresh + * from the provider. Returns null if no storage options were provided. + * + * @return the initial storage options, or null if none were provided + */ + public Map getInitialStorageOptions() { + try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) { + Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed"); + return nativeGetInitialStorageOptions(); + } + } + + private native Map nativeGetInitialStorageOptions(); + + /** + * Get the latest storage options, potentially refreshed from the provider. + * + *
+ * <p>
If a storage options provider was configured and credentials are expiring, this will refresh + * them. + * + * @return the latest storage options (static or refreshed from provider), or null if no storage + * options were configured for this dataset + * @throws RuntimeException if an error occurs while fetching/refreshing options from the provider + */ + public Map getLatestStorageOptions() { + try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) { + Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed"); + return nativeGetLatestStorageOptions(); + } + } + + private native Map nativeGetLatestStorageOptions(); + /** Checkout the dataset to the latest version. */ public void checkoutLatest() { try (LockManager.WriteLock writeLock = lockManager.acquireWriteLock()) { diff --git a/java/src/main/java/org/lance/Fragment.java b/java/src/main/java/org/lance/Fragment.java index 812fb49548c..1454fea5476 100644 --- a/java/src/main/java/org/lance/Fragment.java +++ b/java/src/main/java/org/lance/Fragment.java @@ -209,7 +209,6 @@ private native FragmentUpdateResult nativeUpdateColumns( * .allocator(allocator) * .data(vectorSchemaRoot) * .storageOptions(storageOptions) - * .s3CredentialsRefreshOffsetSeconds(10) * .execute(); * } * @@ -275,8 +274,7 @@ public static List create( params.getEnableStableRowIds(), params.getDataStorageVersion(), params.getStorageOptions(), - Optional.ofNullable(storageOptionsProvider), - params.getS3CredentialsRefreshOffsetSeconds()); + Optional.ofNullable(storageOptionsProvider)); } } @@ -328,8 +326,7 @@ public static List create( params.getEnableStableRowIds(), params.getDataStorageVersion(), params.getStorageOptions(), - Optional.ofNullable(storageOptionsProvider), - params.getS3CredentialsRefreshOffsetSeconds()); + Optional.ofNullable(storageOptionsProvider)); } /** @@ -348,8 +345,7 @@ private static native List createWithFfiArray( Optional enableStableRowIds, Optional dataStorageVersion, Map storageOptions, - Optional storageOptionsProvider, - Optional s3CredentialsRefreshOffsetSeconds); + Optional storageOptionsProvider); /** * Create a fragment from the given arrow stream. @@ -366,6 +362,5 @@ private static native List createWithFfiStream( Optional enableStableRowIds, Optional dataStorageVersion, Map storageOptions, - Optional storageOptionsProvider, - Optional s3CredentialsRefreshOffsetSeconds); + Optional storageOptionsProvider); } diff --git a/java/src/main/java/org/lance/OpenDatasetBuilder.java b/java/src/main/java/org/lance/OpenDatasetBuilder.java index ae082e14ceb..fc350fb0fcf 100644 --- a/java/src/main/java/org/lance/OpenDatasetBuilder.java +++ b/java/src/main/java/org/lance/OpenDatasetBuilder.java @@ -58,7 +58,6 @@ public class OpenDatasetBuilder { private LanceNamespace namespace; private List tableId; private ReadOptions options = new ReadOptions.Builder().build(); - private boolean ignoreNamespaceTableStorageOptions = false; /** Creates a new builder instance. Package-private, use Dataset.open() instead. */ OpenDatasetBuilder() {} @@ -128,19 +127,6 @@ public OpenDatasetBuilder readOptions(ReadOptions options) { return this; } - /** - * Sets whether to ignore storage options from the namespace's describeTable(). 
- * - * @param ignoreNamespaceTableStorageOptions If true, storage options returned from - * describeTable() will be ignored (treated as null) - * @return this builder instance - */ - public OpenDatasetBuilder ignoreNamespaceTableStorageOptions( - boolean ignoreNamespaceTableStorageOptions) { - this.ignoreNamespaceTableStorageOptions = ignoreNamespaceTableStorageOptions; - return this; - } - /** * Opens the dataset with the configured parameters. * @@ -204,8 +190,7 @@ private Dataset buildFromNamespace() { throw new IllegalArgumentException("Namespace did not return a table location"); } - Map namespaceStorageOptions = - ignoreNamespaceTableStorageOptions ? null : response.getStorageOptions(); + Map namespaceStorageOptions = response.getStorageOptions(); ReadOptions.Builder optionsBuilder = new ReadOptions.Builder() @@ -221,9 +206,6 @@ private Dataset buildFromNamespace() { options.getVersion().ifPresent(optionsBuilder::setVersion); options.getBlockSize().ifPresent(optionsBuilder::setBlockSize); options.getSerializedManifest().ifPresent(optionsBuilder::setSerializedManifest); - options - .getS3CredentialsRefreshOffsetSeconds() - .ifPresent(optionsBuilder::setS3CredentialsRefreshOffsetSeconds); Map storageOptions = new HashMap<>(options.getStorageOptions()); if (namespaceStorageOptions != null) { diff --git a/java/src/main/java/org/lance/ReadOptions.java b/java/src/main/java/org/lance/ReadOptions.java index 9d08c834008..0a7a0343a79 100644 --- a/java/src/main/java/org/lance/ReadOptions.java +++ b/java/src/main/java/org/lance/ReadOptions.java @@ -32,7 +32,6 @@ public class ReadOptions { private final Optional serializedManifest; private final Map storageOptions; private final Optional storageOptionsProvider; - private final Optional s3CredentialsRefreshOffsetSeconds; private ReadOptions(Builder builder) { this.version = builder.version; @@ -42,7 +41,6 @@ private ReadOptions(Builder builder) { this.storageOptions = builder.storageOptions; this.serializedManifest = builder.serializedManifest; this.storageOptionsProvider = builder.storageOptionsProvider; - this.s3CredentialsRefreshOffsetSeconds = builder.s3CredentialsRefreshOffsetSeconds; } public Optional getVersion() { @@ -73,10 +71,6 @@ public Optional getStorageOptionsProvider() { return storageOptionsProvider; } - public Optional getS3CredentialsRefreshOffsetSeconds() { - return s3CredentialsRefreshOffsetSeconds; - } - @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -100,7 +94,6 @@ public static class Builder { private Map storageOptions = new HashMap<>(); private Optional serializedManifest = Optional.empty(); private Optional storageOptionsProvider = Optional.empty(); - private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); /** * Set the version of the dataset to read. If not set, read from latest version. @@ -221,22 +214,6 @@ public Builder setStorageOptionsProvider(StorageOptionsProvider storageOptionsPr return this; } - /** - * Set the number of seconds before credential expiration to trigger a refresh. - * - *
- * <p>
Default is 60 seconds. Only applicable when using AWS S3 with temporary credentials. For - * example, if set to 60, credentials will be refreshed when they have less than 60 seconds - * remaining before expiration. This should be set shorter than the credential lifetime to avoid - * using expired credentials. - * - * @param s3CredentialsRefreshOffsetSeconds the refresh offset in seconds - * @return this builder - */ - public Builder setS3CredentialsRefreshOffsetSeconds(long s3CredentialsRefreshOffsetSeconds) { - this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); - return this; - } - public ReadOptions build() { return new ReadOptions(this); } diff --git a/java/src/main/java/org/lance/Transaction.java b/java/src/main/java/org/lance/Transaction.java index 67bc5f8d93d..2d565c73258 100644 --- a/java/src/main/java/org/lance/Transaction.java +++ b/java/src/main/java/org/lance/Transaction.java @@ -118,7 +118,6 @@ public static class Builder { private Operation operation; private Map writeParams; private Map transactionProperties; - private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); public Builder(Dataset dataset) { this.dataset = dataset; @@ -140,21 +139,6 @@ public Builder writeParams(Map writeParams) { return this; } - /** - * Sets the S3 credentials refresh offset in seconds. - * - *
- * <p>
This parameter controls how long before credential expiration to refresh them. For - * example, if credentials expire at T+60s and this is set to 10, credentials will be refreshed - * at T+50s. - * - * @param s3CredentialsRefreshOffsetSeconds Refresh offset in seconds - * @return this builder instance - */ - public Builder s3CredentialsRefreshOffsetSeconds(long s3CredentialsRefreshOffsetSeconds) { - this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); - return this; - } - public Builder operation(Operation operation) { validateState(); this.operation = operation; @@ -171,15 +155,8 @@ private void validateState() { public Transaction build() { Preconditions.checkState(operation != null, "TransactionBuilder has no operations"); - // Merge s3_credentials_refresh_offset_seconds into writeParams if present - Map finalWriteParams = - writeParams != null ? new HashMap<>(writeParams) : new HashMap<>(); - s3CredentialsRefreshOffsetSeconds.ifPresent( - value -> - finalWriteParams.put("s3_credentials_refresh_offset_seconds", String.valueOf(value))); - return new Transaction( - dataset, readVersion, uuid, operation, finalWriteParams, transactionProperties); + dataset, readVersion, uuid, operation, writeParams, transactionProperties); } } } diff --git a/java/src/main/java/org/lance/WriteDatasetBuilder.java b/java/src/main/java/org/lance/WriteDatasetBuilder.java index 74f8c298fe8..01f6fcbb80b 100644 --- a/java/src/main/java/org/lance/WriteDatasetBuilder.java +++ b/java/src/main/java/org/lance/WriteDatasetBuilder.java @@ -18,6 +18,8 @@ import org.lance.namespace.LanceNamespaceStorageOptionsProvider; import org.lance.namespace.model.CreateEmptyTableRequest; import org.lance.namespace.model.CreateEmptyTableResponse; +import org.lance.namespace.model.DeclareTableRequest; +import org.lance.namespace.model.DeclareTableResponse; import org.lance.namespace.model.DescribeTableRequest; import org.lance.namespace.model.DescribeTableResponse; @@ -78,7 +80,6 @@ public class WriteDatasetBuilder { private Optional maxBytesPerFile = Optional.empty(); private Optional enableStableRowIds = Optional.empty(); private Optional dataStorageVersion = Optional.empty(); - private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); /** Creates a new builder instance. Package-private, use Dataset.write() instead. */ WriteDatasetBuilder() { @@ -272,21 +273,6 @@ public WriteDatasetBuilder dataStorageVersion(WriteParams.LanceFileVersion dataS return this; } - /** - * Sets the S3 credentials refresh offset in seconds. - * - *
- * <p>
This parameter controls how long before credential expiration to refresh them. For example, - * if credentials expire at T+60s and this is set to 10, credentials will be refreshed at T+50s. - * - * @param s3CredentialsRefreshOffsetSeconds Refresh offset in seconds - * @return this builder instance - */ - public WriteDatasetBuilder s3CredentialsRefreshOffsetSeconds( - long s3CredentialsRefreshOffsetSeconds) { - this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); - return this; - } - /** * Executes the write operation and returns the created dataset. * @@ -353,18 +339,33 @@ private Dataset executeWithNamespace() { // Mode-specific namespace operations if (mode == WriteParams.WriteMode.CREATE) { - // Call namespace.createEmptyTable() to create new table - CreateEmptyTableRequest request = new CreateEmptyTableRequest(); - request.setId(tableId); - - CreateEmptyTableResponse response = namespace.createEmptyTable(request); + // Try declareTable first, fall back to deprecated createEmptyTable + // for backward compatibility with older namespace implementations. + // createEmptyTable support will be removed in 3.0.0. + String location; + Map responseStorageOptions; + + try { + DeclareTableRequest declareRequest = new DeclareTableRequest(); + declareRequest.setId(tableId); + DeclareTableResponse declareResponse = namespace.declareTable(declareRequest); + location = declareResponse.getLocation(); + responseStorageOptions = declareResponse.getStorageOptions(); + } catch (UnsupportedOperationException e) { + // Fall back to deprecated createEmptyTable + CreateEmptyTableRequest fallbackRequest = new CreateEmptyTableRequest(); + fallbackRequest.setId(tableId); + CreateEmptyTableResponse fallbackResponse = namespace.createEmptyTable(fallbackRequest); + location = fallbackResponse.getLocation(); + responseStorageOptions = fallbackResponse.getStorageOptions(); + } - tableUri = response.getLocation(); + tableUri = location; if (tableUri == null || tableUri.isEmpty()) { throw new IllegalArgumentException("Namespace did not return a table location"); } - namespaceStorageOptions = ignoreNamespaceStorageOptions ? null : response.getStorageOptions(); + namespaceStorageOptions = ignoreNamespaceStorageOptions ? 
null : responseStorageOptions; } else { // For APPEND/OVERWRITE modes, call namespace.describeTable() DescribeTableRequest request = new DescribeTableRequest(); @@ -395,8 +396,6 @@ private Dataset executeWithNamespace() { maxBytesPerFile.ifPresent(paramsBuilder::withMaxBytesPerFile); enableStableRowIds.ifPresent(paramsBuilder::withEnableStableRowIds); dataStorageVersion.ifPresent(paramsBuilder::withDataStorageVersion); - s3CredentialsRefreshOffsetSeconds.ifPresent( - paramsBuilder::withS3CredentialsRefreshOffsetSeconds); WriteParams params = paramsBuilder.build(); @@ -419,8 +418,6 @@ private Dataset executeWithUri() { maxBytesPerFile.ifPresent(paramsBuilder::withMaxBytesPerFile); enableStableRowIds.ifPresent(paramsBuilder::withEnableStableRowIds); dataStorageVersion.ifPresent(paramsBuilder::withDataStorageVersion); - s3CredentialsRefreshOffsetSeconds.ifPresent( - paramsBuilder::withS3CredentialsRefreshOffsetSeconds); WriteParams params = paramsBuilder.build(); diff --git a/java/src/main/java/org/lance/WriteFragmentBuilder.java b/java/src/main/java/org/lance/WriteFragmentBuilder.java index 76882b14a29..56ce06a7b0a 100644 --- a/java/src/main/java/org/lance/WriteFragmentBuilder.java +++ b/java/src/main/java/org/lance/WriteFragmentBuilder.java @@ -37,7 +37,6 @@ * .allocator(allocator) * .data(vectorSchemaRoot) * .storageOptions(storageOptions) - * .s3CredentialsRefreshOffsetSeconds(10) * .execute(); * } */ @@ -134,21 +133,6 @@ public WriteFragmentBuilder storageOptionsProvider(StorageOptionsProvider provid return this; } - /** - * Set the S3 credentials refresh offset in seconds. - * - *
- * <p>
This parameter controls how long before credential expiration to refresh them. For example, - * if credentials expire at T+60s and this is set to 10, credentials will be refreshed at T+50s. - * - * @param seconds refresh offset in seconds - * @return this builder - */ - public WriteFragmentBuilder s3CredentialsRefreshOffsetSeconds(long seconds) { - ensureWriteParamsBuilder(); - this.writeParamsBuilder.withS3CredentialsRefreshOffsetSeconds(seconds); - return this; - } - /** * Set the maximum number of rows per file. * diff --git a/java/src/main/java/org/lance/WriteParams.java b/java/src/main/java/org/lance/WriteParams.java index a0ce1c8c375..1b5a2dceeb9 100644 --- a/java/src/main/java/org/lance/WriteParams.java +++ b/java/src/main/java/org/lance/WriteParams.java @@ -56,7 +56,6 @@ public String getVersionString() { private final Optional enableStableRowIds; private final Optional dataStorageVersion; private Map storageOptions = new HashMap<>(); - private final Optional s3CredentialsRefreshOffsetSeconds; private WriteParams( Optional maxRowsPerFile, @@ -65,8 +64,7 @@ private WriteParams( Optional mode, Optional enableStableRowIds, Optional dataStorageVersion, - Map storageOptions, - Optional s3CredentialsRefreshOffsetSeconds) { + Map storageOptions) { this.maxRowsPerFile = maxRowsPerFile; this.maxRowsPerGroup = maxRowsPerGroup; this.maxBytesPerFile = maxBytesPerFile; @@ -74,7 +72,6 @@ private WriteParams( this.enableStableRowIds = enableStableRowIds; this.dataStorageVersion = dataStorageVersion; this.storageOptions = storageOptions; - this.s3CredentialsRefreshOffsetSeconds = s3CredentialsRefreshOffsetSeconds; } public Optional getMaxRowsPerFile() { @@ -110,10 +107,6 @@ public Map getStorageOptions() { return storageOptions; } - public Optional getS3CredentialsRefreshOffsetSeconds() { - return s3CredentialsRefreshOffsetSeconds; - } - @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -134,7 +127,6 @@ public static class Builder { private Optional enableStableRowIds = Optional.empty(); private Optional dataStorageVersion = Optional.empty(); private Map storageOptions = new HashMap<>(); - private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); public Builder withMaxRowsPerFile(int maxRowsPerFile) { this.maxRowsPerFile = Optional.of(maxRowsPerFile); @@ -171,11 +163,6 @@ public Builder withEnableStableRowIds(boolean enableStableRowIds) { return this; } - public Builder withS3CredentialsRefreshOffsetSeconds(long s3CredentialsRefreshOffsetSeconds) { - this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); - return this; - } - public WriteParams build() { return new WriteParams( maxRowsPerFile, @@ -184,8 +171,7 @@ public WriteParams build() { mode, enableStableRowIds, dataStorageVersion, - storageOptions, - s3CredentialsRefreshOffsetSeconds); + storageOptions); } } } diff --git a/java/src/main/java/org/lance/namespace/DirectoryNamespace.java b/java/src/main/java/org/lance/namespace/DirectoryNamespace.java index 19de6d0a4bf..3ffe2b82f01 100644 --- a/java/src/main/java/org/lance/namespace/DirectoryNamespace.java +++ b/java/src/main/java/org/lance/namespace/DirectoryNamespace.java @@ -21,7 +21,10 @@ import org.apache.arrow.memory.BufferAllocator; import java.io.Closeable; +import java.lang.reflect.Constructor; +import java.util.HashMap; import java.util.Map; +import java.util.Optional; /** * DirectoryNamespace implementation that provides Lance namespace functionality for directory-based @@ -51,6 +54,43 @@ * for S3, 
storage.account_name=myaccount for Azure) * * + *

<p>Credential vending properties (requires credential-vendor-* features to be enabled):
+ *
+ * <p>When credential vendor properties are configured, describeTable() will return vended temporary
+ * credentials. The vendor type is auto-selected based on the table location URI: s3:// for AWS,
+ * gs:// for GCP, az:// for Azure.
+ *
+ * <ul>
+ *   <li>Common properties:
+ *       <ul>
+ *         <li>credential_vendor.enabled (required): Set to "true" to enable credential vending
+ *         <li>credential_vendor.permission (optional): read, write, or admin (default: read)
+ *       </ul>
+ *   <li>AWS-specific properties (for s3:// locations):
+ *       <ul>
+ *         <li>credential_vendor.aws_role_arn (required): IAM role ARN to assume
+ *         <li>credential_vendor.aws_external_id (optional): External ID for assume role
+ *         <li>credential_vendor.aws_region (optional): AWS region
+ *         <li>credential_vendor.aws_role_session_name (optional): Role session name
+ *         <li>credential_vendor.aws_duration_millis (optional): Duration in ms (default: 3600000,
+ *             range: 15min-12hrs)
+ *       </ul>
+ *   <li>GCP-specific properties (for gs:// locations):
+ *       <ul>
+ *         <li>credential_vendor.gcp_service_account (optional): Service account to impersonate
+ *         <li>Note: GCP uses Application Default Credentials (ADC). To use a service account key
+ *             file, set the GOOGLE_APPLICATION_CREDENTIALS environment variable before starting.
+ *         <li>Note: GCP token duration cannot be configured; it's determined by the STS endpoint
+ *       </ul>
+ *   <li>Azure-specific properties (for az:// locations):
+ *       <ul>
+ *         <li>credential_vendor.azure_account_name (required): Azure storage account name
+ *         <li>credential_vendor.azure_tenant_id (optional): Azure tenant ID
+ *         <li>credential_vendor.azure_duration_millis (optional): Duration in ms (default: 3600000,
+ *             up to 7 days)
+ *       </ul>
+ * </ul>
+ *

Example usage (local filesystem):
  *
  * <pre>{@code
@@ -81,6 +121,21 @@
  * // Use namespace...
  * namespace.close();
  * }</pre>
+ *
+ * <p>Example usage (AWS S3 with credential vending):
+ *
+ * <pre>{@code
+ * Map<String, String> properties = new HashMap<>();
+ * properties.put("root", "s3://my-bucket/lance-data");
+ * properties.put("credential_vendor.enabled", "true");
+ * properties.put("credential_vendor.aws_role_arn", "arn:aws:iam::123456789012:role/MyRole");
+ * properties.put("credential_vendor.aws_duration_millis", "3600000");  // 1 hour
+ *
+ * DirectoryNamespace namespace = new DirectoryNamespace();
+ * namespace.initialize(properties, allocator);
+ * // describeTable() will now return vended credentials (AWS vendor auto-selected from s3:// URI)
+ * namespace.close();
+ * }</pre>
*/ public class DirectoryNamespace implements LanceNamespace, Closeable { static { @@ -97,11 +152,43 @@ public DirectoryNamespace() {} @Override public void initialize(Map configProperties, BufferAllocator allocator) { + initialize(configProperties, allocator, null); + } + + /** + * Initialize with a dynamic context provider. + * + *
+ * <p>
If contextProvider is null and the properties contain {@code dynamic_context_provider.impl}, + * the provider will be loaded from the class path. The class must implement {@link + * DynamicContextProvider} and have a constructor accepting {@code Map}. + * + * @param configProperties Configuration properties for the namespace + * @param allocator Arrow buffer allocator + * @param contextProvider Optional provider for per-request context (e.g., dynamic auth headers) + */ + public void initialize( + Map configProperties, + BufferAllocator allocator, + DynamicContextProvider contextProvider) { if (this.nativeDirectoryNamespaceHandle != 0) { throw new IllegalStateException("DirectoryNamespace already initialized"); } this.allocator = allocator; - this.nativeDirectoryNamespaceHandle = createNative(configProperties); + + // If no explicit provider, try to create from properties + DynamicContextProvider provider = contextProvider; + if (provider == null) { + provider = createProviderFromProperties(configProperties).orElse(null); + } + + // Filter out provider properties before passing to native layer + Map filteredProperties = filterProviderProperties(configProperties); + + if (provider != null) { + this.nativeDirectoryNamespaceHandle = createNativeWithProvider(filteredProperties, provider); + } else { + this.nativeDirectoryNamespaceHandle = createNative(filteredProperties); + } } @Override @@ -220,6 +307,14 @@ public CreateEmptyTableResponse createEmptyTable(CreateEmptyTableRequest request return fromJson(responseJson, CreateEmptyTableResponse.class); } + @Override + public DeclareTableResponse declareTable(DeclareTableRequest request) { + ensureInitialized(); + String requestJson = toJson(request); + String responseJson = declareTableNative(nativeDirectoryNamespaceHandle, requestJson); + return fromJson(responseJson, DeclareTableResponse.class); + } + @Override public InsertIntoTableResponse insertIntoTable( InsertIntoTableRequest request, byte[] requestData) { @@ -339,6 +434,9 @@ private static T fromJson(String json, Class clazz) { // Native methods private native long createNative(Map properties); + private native long createNativeWithProvider( + Map properties, DynamicContextProvider contextProvider); + private native void releaseNative(long handle); private native String namespaceIdNative(long handle); @@ -371,6 +469,8 @@ private static T fromJson(String json, Class clazz) { private native String createEmptyTableNative(long handle, String requestJson); + private native String declareTableNative(long handle, String requestJson); + private native String insertIntoTableNative(long handle, String requestJson, byte[] requestData); private native String mergeInsertIntoTableNative( @@ -391,4 +491,77 @@ private native String mergeInsertIntoTableNative( private native String describeTransactionNative(long handle, String requestJson); private native String alterTransactionNative(long handle, String requestJson); + + // ========================================================================== + // Provider loading helpers + // ========================================================================== + + private static final String PROVIDER_PREFIX = "dynamic_context_provider."; + private static final String IMPL_KEY = "dynamic_context_provider.impl"; + + /** + * Create a context provider from properties if configured. + * + *
+ * <p>
Loads the class specified by {@code dynamic_context_provider.impl} from the class path and + * instantiates it with the extracted provider properties. + */ + private static Optional createProviderFromProperties( + Map properties) { + String className = properties.get(IMPL_KEY); + if (className == null || className.isEmpty()) { + return Optional.empty(); + } + + // Extract provider-specific properties (strip prefix, exclude impl key) + Map providerProps = new HashMap<>(); + for (Map.Entry entry : properties.entrySet()) { + String key = entry.getKey(); + if (key.startsWith(PROVIDER_PREFIX) && !key.equals(IMPL_KEY)) { + String propName = key.substring(PROVIDER_PREFIX.length()); + providerProps.put(propName, entry.getValue()); + } + } + + try { + Class providerClass = Class.forName(className); + if (!DynamicContextProvider.class.isAssignableFrom(providerClass)) { + throw new IllegalArgumentException( + String.format( + "Class '%s' does not implement DynamicContextProvider interface", className)); + } + + @SuppressWarnings("unchecked") + Class typedClass = + (Class) providerClass; + + Constructor constructor = + typedClass.getConstructor(Map.class); + return Optional.of(constructor.newInstance(providerProps)); + + } catch (ClassNotFoundException e) { + throw new IllegalArgumentException( + String.format("Failed to load context provider class '%s': %s", className, e), e); + } catch (NoSuchMethodException e) { + throw new IllegalArgumentException( + String.format( + "Context provider class '%s' must have a public constructor " + + "that accepts Map", + className), + e); + } catch (ReflectiveOperationException e) { + throw new IllegalArgumentException( + String.format("Failed to instantiate context provider '%s': %s", className, e), e); + } + } + + /** Filter out dynamic_context_provider.* properties from the map. */ + private static Map filterProviderProperties(Map properties) { + Map filtered = new HashMap<>(); + for (Map.Entry entry : properties.entrySet()) { + if (!entry.getKey().startsWith(PROVIDER_PREFIX)) { + filtered.put(entry.getKey(), entry.getValue()); + } + } + return filtered; + } } diff --git a/java/src/main/java/org/lance/namespace/DynamicContextProvider.java b/java/src/main/java/org/lance/namespace/DynamicContextProvider.java new file mode 100644 index 00000000000..77b10c892a4 --- /dev/null +++ b/java/src/main/java/org/lance/namespace/DynamicContextProvider.java @@ -0,0 +1,77 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.lance.namespace; + +import java.util.Map; + +/** + * Interface for providing dynamic per-request context to namespace operations. + * + *

Implementations can generate per-request context (e.g., authentication headers) based on the
+ * operation being performed. The provider is called synchronously before each namespace operation.
+ *
+ * <p>For RestNamespace, context keys that start with {@code headers.} are converted to HTTP headers
+ * by stripping the prefix. For example, {@code {"headers.Authorization": "Bearer abc123"}} becomes
+ * the {@code Authorization: Bearer abc123} header. Keys without the {@code headers.} prefix are
+ * ignored for HTTP headers but may be used for other purposes.
+ *
+ * <p>Example implementation:
+ *
+ * <pre>
+ * public class MyContextProvider implements DynamicContextProvider {
+ *   @Override
+ *   public Map<String, String> provideContext(String operation, String objectId) {
+ *     Map<String, String> context = new HashMap<>();
+ *     context.put("headers.Authorization", "Bearer " + getAuthToken());
+ *     context.put("headers.X-Request-Id", UUID.randomUUID().toString());
+ *     return context;
+ *   }
+ * }
+ * </pre>
+ *
+ * <p>Usage with DirectoryNamespace:
+ *
+ * <pre>
+ * DynamicContextProvider provider = new MyContextProvider();
+ * Map<String, String> properties = Map.of("root", "/path/to/data");
+ * DirectoryNamespace namespace = new DirectoryNamespace();
+ * namespace.initialize(properties, allocator, provider);
+ * </pre>
+ *
+ * <p>Usage with RestNamespace:
+ *
+ * <pre>
+ * DynamicContextProvider provider = new MyContextProvider();
+ * Map<String, String> properties = Map.of("uri", "https://api.example.com");
+ * RestNamespace namespace = new RestNamespace();
+ * namespace.initialize(properties, provider);
+ * </pre>
+ */
+public interface DynamicContextProvider {
+
+  /**
+   * Provide context for a namespace operation.
+   *
+   * <p>
This method is called synchronously before each namespace operation. Implementations should + * be thread-safe as multiple operations may be performed concurrently. + * + * @param operation The operation name (e.g., "list_tables", "describe_table", "create_namespace") + * @param objectId The object identifier (namespace or table ID in delimited form, e.g., + * "workspace$table_name") + * @return Map of context key-value pairs. For HTTP headers, use keys with the "headers." prefix + * (e.g., "headers.Authorization"). Return an empty map if no additional context is needed. + * Must not return null. + */ + Map provideContext(String operation, String objectId); +} diff --git a/java/src/main/java/org/lance/namespace/RestNamespace.java b/java/src/main/java/org/lance/namespace/RestNamespace.java index 995c53c4b92..840e9f3d690 100644 --- a/java/src/main/java/org/lance/namespace/RestNamespace.java +++ b/java/src/main/java/org/lance/namespace/RestNamespace.java @@ -21,7 +21,10 @@ import org.apache.arrow.memory.BufferAllocator; import java.io.Closeable; +import java.lang.reflect.Constructor; +import java.util.HashMap; import java.util.Map; +import java.util.Optional; /** * RestNamespace implementation that provides Lance namespace functionality via REST API endpoints. @@ -74,11 +77,47 @@ public RestNamespace() {} @Override public void initialize(Map configProperties, BufferAllocator allocator) { + initialize(configProperties, allocator, null); + } + + /** + * Initialize with a dynamic context provider. + * + *
+ * <p>
The context provider is called before each namespace operation and can return per-request + * context (e.g., authentication headers). Context keys that start with {@code headers.} are + * converted to HTTP headers by stripping the prefix. + * + *
+ * <p>
If contextProvider is null and the properties contain {@code dynamic_context_provider.impl}, + * the provider will be loaded from the class path. The class must implement {@link + * DynamicContextProvider} and have a constructor accepting {@code Map}. + * + * @param configProperties Configuration properties for the namespace + * @param allocator Arrow buffer allocator + * @param contextProvider Optional provider for per-request context (e.g., dynamic auth headers) + */ + public void initialize( + Map configProperties, + BufferAllocator allocator, + DynamicContextProvider contextProvider) { if (this.nativeRestNamespaceHandle != 0) { throw new IllegalStateException("RestNamespace already initialized"); } this.allocator = allocator; - this.nativeRestNamespaceHandle = createNative(configProperties); + + // If no explicit provider, try to create from properties + DynamicContextProvider provider = contextProvider; + if (provider == null) { + provider = createProviderFromProperties(configProperties).orElse(null); + } + + // Filter out provider properties before passing to native layer + Map filteredProperties = filterProviderProperties(configProperties); + + if (provider != null) { + this.nativeRestNamespaceHandle = createNativeWithProvider(filteredProperties, provider); + } else { + this.nativeRestNamespaceHandle = createNative(filteredProperties); + } } @Override @@ -196,6 +235,14 @@ public CreateEmptyTableResponse createEmptyTable(CreateEmptyTableRequest request return fromJson(responseJson, CreateEmptyTableResponse.class); } + @Override + public DeclareTableResponse declareTable(DeclareTableRequest request) { + ensureInitialized(); + String requestJson = toJson(request); + String responseJson = declareTableNative(nativeRestNamespaceHandle, requestJson); + return fromJson(responseJson, DeclareTableResponse.class); + } + @Override public InsertIntoTableResponse insertIntoTable( InsertIntoTableRequest request, byte[] requestData) { @@ -313,6 +360,9 @@ private static T fromJson(String json, Class clazz) { // Native methods private native long createNative(Map properties); + private native long createNativeWithProvider( + Map properties, DynamicContextProvider contextProvider); + private native void releaseNative(long handle); private native String namespaceIdNative(long handle); @@ -345,6 +395,8 @@ private static T fromJson(String json, Class clazz) { private native String createEmptyTableNative(long handle, String requestJson); + private native String declareTableNative(long handle, String requestJson); + private native String insertIntoTableNative(long handle, String requestJson, byte[] requestData); private native String mergeInsertIntoTableNative( @@ -365,4 +417,77 @@ private native String mergeInsertIntoTableNative( private native String describeTransactionNative(long handle, String requestJson); private native String alterTransactionNative(long handle, String requestJson); + + // ========================================================================== + // Provider loading helpers + // ========================================================================== + + private static final String PROVIDER_PREFIX = "dynamic_context_provider."; + private static final String IMPL_KEY = "dynamic_context_provider.impl"; + + /** + * Create a context provider from properties if configured. + * + *
+ * <p>
Loads the class specified by {@code dynamic_context_provider.impl} from the class path and + * instantiates it with the extracted provider properties. + */ + private static Optional createProviderFromProperties( + Map properties) { + String className = properties.get(IMPL_KEY); + if (className == null || className.isEmpty()) { + return Optional.empty(); + } + + // Extract provider-specific properties (strip prefix, exclude impl key) + Map providerProps = new HashMap<>(); + for (Map.Entry entry : properties.entrySet()) { + String key = entry.getKey(); + if (key.startsWith(PROVIDER_PREFIX) && !key.equals(IMPL_KEY)) { + String propName = key.substring(PROVIDER_PREFIX.length()); + providerProps.put(propName, entry.getValue()); + } + } + + try { + Class providerClass = Class.forName(className); + if (!DynamicContextProvider.class.isAssignableFrom(providerClass)) { + throw new IllegalArgumentException( + String.format( + "Class '%s' does not implement DynamicContextProvider interface", className)); + } + + @SuppressWarnings("unchecked") + Class typedClass = + (Class) providerClass; + + Constructor constructor = + typedClass.getConstructor(Map.class); + return Optional.of(constructor.newInstance(providerProps)); + + } catch (ClassNotFoundException e) { + throw new IllegalArgumentException( + String.format("Failed to load context provider class '%s': %s", className, e), e); + } catch (NoSuchMethodException e) { + throw new IllegalArgumentException( + String.format( + "Context provider class '%s' must have a public constructor " + + "that accepts Map", + className), + e); + } catch (ReflectiveOperationException e) { + throw new IllegalArgumentException( + String.format("Failed to instantiate context provider '%s': %s", className, e), e); + } + } + + /** Filter out dynamic_context_provider.* properties from the map. 
*/ + private static Map filterProviderProperties(Map properties) { + Map filtered = new HashMap<>(); + for (Map.Entry entry : properties.entrySet()) { + if (!entry.getKey().startsWith(PROVIDER_PREFIX)) { + filtered.put(entry.getKey(), entry.getValue()); + } + } + return filtered; + } } diff --git a/java/src/test/java/org/lance/NamespaceIntegrationTest.java b/java/src/test/java/org/lance/NamespaceIntegrationTest.java index d2ea43f5e53..2d6f8ab1443 100644 --- a/java/src/test/java/org/lance/NamespaceIntegrationTest.java +++ b/java/src/test/java/org/lance/NamespaceIntegrationTest.java @@ -18,6 +18,8 @@ import org.lance.namespace.LanceNamespaceStorageOptionsProvider; import org.lance.namespace.model.CreateEmptyTableRequest; import org.lance.namespace.model.CreateEmptyTableResponse; +import org.lance.namespace.model.DeclareTableRequest; +import org.lance.namespace.model.DeclareTableResponse; import org.lance.namespace.model.DescribeTableRequest; import org.lance.namespace.model.DescribeTableResponse; import org.lance.operation.Append; @@ -201,6 +203,8 @@ private Map modifyStorageOptions( long expiresAtMillis = System.currentTimeMillis() + (credentialExpiresInSeconds * 1000L); modified.put("expires_at_millis", String.valueOf(expiresAtMillis)); + // Set refresh offset to 1 second (1000ms) for short-lived credential tests + modified.put("refresh_offset_millis", "1000"); return modified; } @@ -215,6 +219,16 @@ public CreateEmptyTableResponse createEmptyTable(CreateEmptyTableRequest request return response; } + @Override + public DeclareTableResponse declareTable(DeclareTableRequest request) { + int count = createCallCount.incrementAndGet(); + + DeclareTableResponse response = inner.declareTable(request); + response.setStorageOptions(modifyStorageOptions(response.getStorageOptions(), count)); + + return response; + } + @Override public DescribeTableResponse describeTable(DescribeTableRequest request) { int count = describeCallCount.incrementAndGet(); @@ -314,11 +328,7 @@ public VectorSchemaRoot getVectorSchemaRoot() { assertEquals(1, namespace.getCreateCallCount(), "createEmptyTable should be called once"); // Open dataset through namespace WITH refresh enabled - // Use 10-second refresh offset, so credentials effectively expire at T+50s - ReadOptions readOptions = - new ReadOptions.Builder() - .setS3CredentialsRefreshOffsetSeconds(10) // Refresh 10s before expiration - .build(); + ReadOptions readOptions = new ReadOptions.Builder().build(); int callCountBeforeOpen = namespace.getDescribeCallCount(); try (Dataset dsFromNamespace = @@ -439,7 +449,6 @@ public VectorSchemaRoot getVectorSchemaRoot() { .namespace(namespace) .tableId(Arrays.asList(tableName)) .mode(WriteParams.WriteMode.CREATE) - .s3CredentialsRefreshOffsetSeconds(2) // Refresh 2s before expiration .execute()) { assertEquals(2, dataset.countRows()); } @@ -449,11 +458,7 @@ public VectorSchemaRoot getVectorSchemaRoot() { assertEquals(1, namespace.getCreateCallCount(), "createEmptyTable should be called once"); // Open dataset through namespace with refresh enabled - // Use 2-second refresh offset so credentials effectively expire at T+3s (5s - 2s) - ReadOptions readOptions = - new ReadOptions.Builder() - .setS3CredentialsRefreshOffsetSeconds(2) // Refresh 2s before expiration - .build(); + ReadOptions readOptions = new ReadOptions.Builder().build(); int callCountBeforeOpen = namespace.getDescribeCallCount(); try (Dataset dsFromNamespace = @@ -680,7 +685,6 @@ public VectorSchemaRoot getVectorSchemaRoot() { }; // Use the write builder to 
create a dataset through namespace - // Set a 1-second refresh offset. Credentials expire at T+60s, so refresh at T+59s. // Write completes instantly, so NO describeTable call should happen for refresh. try (Dataset dataset = Dataset.write() @@ -689,7 +693,6 @@ public VectorSchemaRoot getVectorSchemaRoot() { .namespace(namespace) .tableId(Arrays.asList(tableName)) .mode(WriteParams.WriteMode.CREATE) - .s3CredentialsRefreshOffsetSeconds(1) .execute()) { // Verify createEmptyTable was called exactly ONCE @@ -720,9 +723,7 @@ public VectorSchemaRoot getVectorSchemaRoot() { "describeTable should still be 0 after close (no refresh needed)"); // Now open the dataset through namespace with long-lived credentials (60s expiration) - // With 1s refresh offset, credentials are valid for 59s - plenty of time for reads - ReadOptions readOptions = - new ReadOptions.Builder().setS3CredentialsRefreshOffsetSeconds(1).build(); + ReadOptions readOptions = new ReadOptions.Builder().build(); try (Dataset dsFromNamespace = Dataset.open() diff --git a/java/src/test/java/org/lance/namespace/DynamicContextProviderTest.java b/java/src/test/java/org/lance/namespace/DynamicContextProviderTest.java new file mode 100644 index 00000000000..7959eb9be58 --- /dev/null +++ b/java/src/test/java/org/lance/namespace/DynamicContextProviderTest.java @@ -0,0 +1,307 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.lance.namespace; + +import org.lance.namespace.model.*; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.nio.file.Path; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.junit.jupiter.api.Assertions.*; + +/** Tests for DynamicContextProvider interface. 
*/ +public class DynamicContextProviderTest { + @TempDir Path tempDir; + + private BufferAllocator allocator; + + @BeforeEach + void setUp() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @AfterEach + void tearDown() { + if (allocator != null) { + allocator.close(); + } + } + + @Test + void testDirectoryNamespaceWithContextProvider() { + AtomicInteger callCount = new AtomicInteger(0); + + DynamicContextProvider provider = + (operation, objectId) -> { + callCount.incrementAndGet(); + Map context = new HashMap<>(); + context.put("headers.Authorization", "Bearer test-token-123"); + context.put("headers.X-Request-Id", "req-" + operation); + return context; + }; + + try (DirectoryNamespace namespace = new DirectoryNamespace()) { + Map config = new HashMap<>(); + config.put("root", tempDir.toString()); + namespace.initialize(config, allocator, provider); + + // Perform operations to verify the provider is called + CreateNamespaceRequest createReq = + new CreateNamespaceRequest().id(Arrays.asList("workspace")); + namespace.createNamespace(createReq); + + ListNamespacesRequest listReq = new ListNamespacesRequest(); + namespace.listNamespaces(listReq); + + // The provider should have been called for each operation + // Note: DirectoryNamespace stores the provider but may not actively use context + // until the underlying Rust code is updated to use it for credential vending + assertNotNull(namespace.namespaceId()); + } + } + + @Test + void testDirectoryNamespaceWithNullProvider() { + try (DirectoryNamespace namespace = new DirectoryNamespace()) { + Map config = new HashMap<>(); + config.put("root", tempDir.toString()); + + // Should work with null provider (backward compatibility) + namespace.initialize(config, allocator, null); + + CreateNamespaceRequest createReq = + new CreateNamespaceRequest().id(Arrays.asList("workspace")); + namespace.createNamespace(createReq); + + ListNamespacesRequest listReq = new ListNamespacesRequest(); + ListNamespacesResponse listResp = namespace.listNamespaces(listReq); + + assertNotNull(listResp); + assertTrue(listResp.getNamespaces().contains("workspace")); + } + } + + @Test + void testContextProviderReturnsEmptyMap() { + DynamicContextProvider provider = (operation, objectId) -> new HashMap<>(); + + try (DirectoryNamespace namespace = new DirectoryNamespace()) { + Map config = new HashMap<>(); + config.put("root", tempDir.toString()); + namespace.initialize(config, allocator, provider); + + CreateNamespaceRequest createReq = + new CreateNamespaceRequest().id(Arrays.asList("workspace")); + CreateNamespaceResponse resp = namespace.createNamespace(createReq); + + assertNotNull(resp); + } + } + + @Test + void testRestNamespaceWithContextProviderIntegration() { + AtomicInteger callCount = new AtomicInteger(0); + + DynamicContextProvider provider = + (operation, objectId) -> { + callCount.incrementAndGet(); + Map context = new HashMap<>(); + context.put("headers.Authorization", "Bearer xyz-token"); + context.put("headers.X-Trace-Id", "trace-" + System.currentTimeMillis()); + return context; + }; + + // Start a test REST server with DirectoryNamespace backend + Map backendConfig = new HashMap<>(); + backendConfig.put("root", tempDir.toString()); + + try (RestAdapter adapter = new RestAdapter("dir", backendConfig, "127.0.0.1", null)) { + adapter.start(); + int port = adapter.getPort(); + + // Create RestNamespace client with context provider + try (RestNamespace namespace = new RestNamespace()) { + Map clientConfig = new HashMap<>(); + clientConfig.put("uri", 
"http://127.0.0.1:" + port); + namespace.initialize(clientConfig, allocator, provider); + + // Perform operations - context provider should be called + CreateNamespaceRequest createReq = + new CreateNamespaceRequest().id(Arrays.asList("workspace")); + namespace.createNamespace(createReq); + + ListNamespacesRequest listReq = new ListNamespacesRequest(); + ListNamespacesResponse listResp = namespace.listNamespaces(listReq); + + // Verify provider was called for REST operations + assertTrue(callCount.get() >= 2, "Context provider should be called for each operation"); + assertNotNull(listResp); + assertTrue(listResp.getNamespaces().contains("workspace")); + } + } + } + + @Test + void testContextProviderReceivesCorrectOperationInfo() { + Map capturedOperations = new HashMap<>(); + + DynamicContextProvider provider = + (operation, objectId) -> { + capturedOperations.put(operation, objectId); + return new HashMap<>(); + }; + + Map backendConfig = new HashMap<>(); + backendConfig.put("root", tempDir.toString()); + + try (RestAdapter adapter = new RestAdapter("dir", backendConfig, "127.0.0.1", null)) { + adapter.start(); + int port = adapter.getPort(); + + try (RestNamespace namespace = new RestNamespace()) { + Map clientConfig = new HashMap<>(); + clientConfig.put("uri", "http://127.0.0.1:" + port); + namespace.initialize(clientConfig, allocator, provider); + + // Create namespace + CreateNamespaceRequest createReq = + new CreateNamespaceRequest().id(Arrays.asList("workspace")); + namespace.createNamespace(createReq); + + // List namespaces + ListNamespacesRequest listReq = new ListNamespacesRequest(); + namespace.listNamespaces(listReq); + + // Verify operations were captured + assertTrue(capturedOperations.containsKey("create_namespace")); + assertTrue(capturedOperations.containsKey("list_namespaces")); + } + } + } + + // ========================================================================== + // Class path based provider tests + // ========================================================================== + + @Test + void testDirectoryNamespaceWithClassPathProvider() { + try (DirectoryNamespace namespace = new DirectoryNamespace()) { + Map config = new HashMap<>(); + config.put("root", tempDir.toString()); + config.put("dynamic_context_provider.impl", "org.lance.namespace.TestContextProvider"); + config.put("dynamic_context_provider.token", "my-secret-token"); + config.put("dynamic_context_provider.prefix", "Token"); + + namespace.initialize(config, allocator); + + // Verify namespace works + CreateNamespaceRequest createReq = + new CreateNamespaceRequest().id(Arrays.asList("workspace")); + namespace.createNamespace(createReq); + + ListNamespacesRequest listReq = new ListNamespacesRequest(); + ListNamespacesResponse listResp = namespace.listNamespaces(listReq); + + assertNotNull(listResp); + assertTrue(listResp.getNamespaces().contains("workspace")); + } + } + + @Test + void testRestNamespaceWithClassPathProvider() { + Map backendConfig = new HashMap<>(); + backendConfig.put("root", tempDir.toString()); + + try (RestAdapter adapter = new RestAdapter("dir", backendConfig, "127.0.0.1", null)) { + adapter.start(); + int port = adapter.getPort(); + + try (RestNamespace namespace = new RestNamespace()) { + Map clientConfig = new HashMap<>(); + clientConfig.put("uri", "http://127.0.0.1:" + port); + clientConfig.put( + "dynamic_context_provider.impl", "org.lance.namespace.TestContextProvider"); + clientConfig.put("dynamic_context_provider.token", "secret-api-key"); + + 
+        namespace.initialize(clientConfig, allocator);
+
+        CreateNamespaceRequest createReq =
+            new CreateNamespaceRequest().id(Arrays.asList("workspace"));
+        namespace.createNamespace(createReq);
+
+        ListNamespacesRequest listReq = new ListNamespacesRequest();
+        ListNamespacesResponse listResp = namespace.listNamespaces(listReq);
+
+        assertNotNull(listResp);
+        assertTrue(listResp.getNamespaces().contains("workspace"));
+      }
+    }
+  }
+
+  @Test
+  void testUnknownProviderClassThrowsException() {
+    try (DirectoryNamespace namespace = new DirectoryNamespace()) {
+      Map<String, String> config = new HashMap<>();
+      config.put("root", tempDir.toString());
+      config.put("dynamic_context_provider.impl", "com.nonexistent.NonExistentProvider");
+
+      assertThrows(
+          IllegalArgumentException.class,
+          () -> namespace.initialize(config, allocator),
+          "Failed to load context provider class");
+    }
+  }
+
+  @Test
+  void testExplicitProviderTakesPrecedence() {
+    AtomicInteger explicitCallCount = new AtomicInteger(0);
+
+    DynamicContextProvider explicitProvider =
+        (operation, objectId) -> {
+          explicitCallCount.incrementAndGet();
+          Map<String, String> ctx = new HashMap<>();
+          ctx.put("headers.Authorization", "Bearer explicit");
+          return ctx;
+        };
+
+    try (DirectoryNamespace namespace = new DirectoryNamespace()) {
+      Map<String, String> config = new HashMap<>();
+      config.put("root", tempDir.toString());
+      // Even though we specify a class path, explicit provider should take precedence
+      config.put("dynamic_context_provider.impl", "org.lance.namespace.TestContextProvider");
+      config.put("dynamic_context_provider.token", "ignored");
+
+      // Pass explicit provider - should take precedence over properties
+      namespace.initialize(config, allocator, explicitProvider);
+
+      // Verify namespace works
+      CreateNamespaceRequest createReq =
+          new CreateNamespaceRequest().id(Arrays.asList("workspace"));
+      namespace.createNamespace(createReq);
+
+      // Namespace should work
+      assertNotNull(namespace.namespaceId());
+    }
+  }
+}
diff --git a/java/src/test/java/org/lance/namespace/TestContextProvider.java b/java/src/test/java/org/lance/namespace/TestContextProvider.java
new file mode 100644
index 00000000000..4eea30c88c3
--- /dev/null
+++ b/java/src/test/java/org/lance/namespace/TestContextProvider.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.lance.namespace;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/** Test implementation of DynamicContextProvider for testing class path loading.
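+ * Expected to be instantiated reflectively when {@code dynamic_context_provider.impl} names
+ * this class, receiving the prefix-stripped {@code dynamic_context_provider.*} properties.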
+ */
+public class TestContextProvider implements DynamicContextProvider {
+  private final String token;
+  private final String prefix;
+
+  public TestContextProvider(Map<String, String> properties) {
+    this.token = properties.get("token");
+    this.prefix = properties.getOrDefault("prefix", "Bearer");
+  }
+
+  @Override
+  public Map<String, String> provideContext(String operation, String objectId) {
+    Map<String, String> context = new HashMap<>();
+    context.put("headers.Authorization", prefix + " " + token);
+    context.put("headers.X-Operation", operation);
+    return context;
+  }
+}
diff --git a/python/Cargo.lock b/python/Cargo.lock
index bc52b611093..10753904f26 100644
--- a/python/Cargo.lock
+++ b/python/Cargo.lock
@@ -2,6 +2,12 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "RustyXML"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5"
+
 [[package]]
 name = "abi_stable"
 version = "0.11.3"
@@ -463,6 +469,17 @@ dependencies = [
  "syn 1.0.109",
 ]
 
+[[package]]
+name = "async-channel"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
+dependencies = [
+ "concurrent-queue",
+ "event-listener 2.5.3",
+ "futures-core",
+]
+
 [[package]]
 name = "async-channel"
 version = "2.5.0"
@@ -501,17 +518,53 @@ dependencies = [
 "abi_stable",
 ]
 
+[[package]]
+name = "async-io"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "456b8a8feb6f42d237746d4b3e9a178494627745c3c56c6ea55d92ba50d026fc"
+dependencies = [
+ "autocfg",
+ "cfg-if",
+ "concurrent-queue",
+ "futures-io",
+ "futures-lite 2.6.1",
+ "parking",
+ "polling",
+ "rustix 1.1.3",
+ "slab",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "async-lock"
 version = "3.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "290f7f2596bd5b78a9fec8088ccd89180d7f9f55b94b0576823bbbdc72ee8311"
 dependencies = [
- "event-listener",
+ "event-listener 5.4.1",
  "event-listener-strategy",
  "pin-project-lite",
 ]
 
+[[package]]
+name = "async-process"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc50921ec0055cdd8a16de48773bfeec5c972598674347252c0399676be7da75"
+dependencies = [
+ "async-channel 2.5.0",
+ "async-io",
+ "async-lock",
+ "async-signal",
+ "async-task",
+ "blocking",
+ "cfg-if",
+ "event-listener 5.4.1",
+ "futures-lite 2.6.1",
+ "rustix 1.1.3",
+]
+
 [[package]]
 name = "async-recursion"
 version = "1.1.1"
@@ -523,6 +576,30 @@ dependencies = [
  "syn 2.0.114",
 ]
 
+[[package]]
+name = "async-signal"
+version = "0.2.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43c070bbf59cd3570b6b2dd54cd772527c7c3620fce8be898406dd3ed6adc64c"
+dependencies = [
+ "async-io",
+ "async-lock",
+ "atomic-waker",
+ "cfg-if",
+ "futures-core",
+ "futures-io",
+ "rustix 1.1.3",
+ "signal-hook-registry",
+ "slab",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "async-task"
+version = "4.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de"
+
 [[package]]
 name = "async-trait"
 version = "0.1.89"
@@ -583,7 +660,7 @@ dependencies = [
 "aws-smithy-types",
 "aws-types",
 "bytes",
- "fastrand",
+ "fastrand 2.3.0",
 "hex",
 "http 1.4.0",
 "ring",
@@ -608,9 +685,9 @@ dependencies = [
 [[package]]
 name = "aws-lc-rs"
-version = "1.15.2"
+version = "1.15.3"
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a88aab2464f1f25453baa7a07c84c5b7684e274054ba06817f382357f77a288" +checksum = "e84ce723ab67259cfeb9877c6a639ee9eb7a27b28123abd71db7f0d5d0cc9d86" dependencies = [ "aws-lc-sys", "zeroize", @@ -618,9 +695,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.35.0" +version = "0.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b45afffdee1e7c9126814751f88dddc747f41d91da16c9551a0f1e8a11e788a1" +checksum = "43a442ece363113bd4bd4c8b18977a7798dd4d3c3383f34fb61936960e8f4ad8" dependencies = [ "cc", "cmake", @@ -630,9 +707,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.17" +version = "1.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d81b5b2898f6798ad58f484856768bca817e3cd9de0974c24ae0f1113fe88f1b" +checksum = "959dab27ce613e6c9658eb3621064d0e2027e5f2acb65bc526a43577facea557" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -643,7 +720,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "http-body 0.4.6", "percent-encoding", @@ -654,21 +731,22 @@ dependencies = [ [[package]] name = "aws-sdk-dynamodb" -version = "1.101.0" +version = "1.102.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6f98cd9e5f2fc790aff1f393bc3c8680deea31c05d3c6f23b625cdc50b1b6b4" +checksum = "f5f7e6a53cf5ee8b7041c73106d9a93480b47f8b955466262b043aab0b5bf489" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -676,21 +754,22 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.91.0" +version = "1.92.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ee6402a36f27b52fe67661c6732d684b2635152b676aa2babbfb5204f99115d" +checksum = "b7d63bd2bdeeb49aa3f9b00c15e18583503b778b2e792fc06284d54e7d5b6566" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -698,21 +777,22 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.93.0" +version = "1.94.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a45a7f750bbd170ee3677671ad782d90b894548f4e4ae168302c57ec9de5cb3e" +checksum = "532d93574bf731f311bafb761366f9ece345a0416dbcc273d81d6d1a1205239b" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -720,22 +800,23 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.95.0" +version = "1.96.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55542378e419558e6b1f398ca70adb0b2088077e79ad9f14eb09441f2f7b2164" +checksum = "357e9a029c7524db6a0099cd77fbd5da165540339e7296cca603531bc783b56c" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", 
"aws-smithy-json", + "aws-smithy-observability", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-smithy-xml", "aws-types", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -836,9 +917,9 @@ dependencies = [ [[package]] name = "aws-smithy-observability" -version = "0.1.5" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f616c3f2260612fe44cede278bafa18e73e6479c4e393e2c4518cf2a9a228a" +checksum = "ef1fcbefc7ece1d70dcce29e490f269695dfca2d2bacdeaf9e5c3f799e4e6a42" dependencies = [ "aws-smithy-runtime-api", ] @@ -855,9 +936,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.9.5" +version = "1.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a392db6c583ea4a912538afb86b7be7c5d8887d91604f50eb55c262ee1b4a5f5" +checksum = "bb5b6167fcdf47399024e81ac08e795180c576a20e4d4ce67949f9a88ae37dc1" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -866,7 +947,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "http 1.4.0", "http-body 0.4.6", @@ -879,9 +960,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.9.3" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab0d43d899f9e508300e587bf582ba54c27a452dd0a9ea294690669138ae14a2" +checksum = "efce7aaaf59ad53c5412f14fc19b2d5c6ab2c3ec688d272fd31f76ec12f44fb0" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -896,9 +977,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.3.5" +version = "1.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "905cb13a9895626d49cf2ced759b062d913834c7482c38e49557eac4e6193f01" +checksum = "65f172bcb02424eb94425db8aed1b6d583b5104d4d5ddddf22402c661a320048" dependencies = [ "base64-simd", "bytes", @@ -998,17 +1079,130 @@ dependencies = [ "tracing", ] +[[package]] +name = "azure_core" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b552ad43a45a746461ec3d3a51dfb6466b4759209414b439c165eb6a6b7729e" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes", + "dyn-clone", + "futures", + "getrandom 0.2.17", + "hmac", + "http-types", + "once_cell", + "paste", + "pin-project", + "quick-xml 0.31.0", + "rand 0.8.5", + "reqwest", + "rustc_version", + "serde", + "serde_json", + "sha2", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_identity" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ddd80344317c40c04b603807b63a5cefa532f1b43522e72f480a988141f744" +dependencies = [ + "async-lock", + "async-process", + "async-trait", + "azure_core", + "futures", + "oauth2", + "pin-project", + "serde", + "time", + "tracing", + "tz-rs", + "url", + "uuid", +] + +[[package]] +name = "azure_storage" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59f838159f4d29cb400a14d9d757578ba495ae64feb07a7516bf9e4415127126" +dependencies = [ + "RustyXML", + "async-lock", + "async-trait", + "azure_core", + "bytes", + "serde", + "serde_derive", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_storage_blobs" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97e83c3636ae86d9a6a7962b2112e3b19eb3903915c50ce06ff54ff0a2e6a7e4" 
+dependencies = [ + "RustyXML", + "azure_core", + "azure_storage", + "azure_svc_blobstorage", + "bytes", + "futures", + "serde", + "serde_derive", + "serde_json", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_svc_blobstorage" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e6c6f20c5611b885ba94c7bae5e02849a267381aecb8aee577e8c35ff4064c6" +dependencies = [ + "azure_core", + "bytes", + "futures", + "log", + "once_cell", + "serde", + "serde_json", + "time", +] + [[package]] name = "backon" version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cffb0e931875b666fc4fcb20fee52e9bbd1ef836fd9e9e04ec21555f9f85f7ef" dependencies = [ - "fastrand", + "fastrand 2.3.0", "gloo-timers", "tokio", ] +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.21.7" @@ -1144,6 +1338,19 @@ dependencies = [ "generic-array", ] +[[package]] +name = "blocking" +version = "1.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e83f8d02be6967315521be875afa792a316e28d57b5a2d401897e2a7921b7f21" +dependencies = [ + "async-channel 2.5.0", + "async-task", + "futures-io", + "futures-lite 2.6.1", + "piper", +] + [[package]] name = "bon" version = "3.8.2" @@ -1263,9 +1470,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.52" +version = "1.2.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd4932aefd12402b36c60956a4fe0035421f544799057659ff86f923657aada3" +checksum = "755d2fce177175ffca841e9a06afdb2c4ab0f593d53b4dee48147dfaade85932" dependencies = [ "find-msvc-tools", "jobserver", @@ -1302,9 +1509,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.42" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" +checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" dependencies = [ "iana-time-zone", "js-sys", @@ -1395,6 +1602,12 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "const_fn" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f8a2ca5ac02d09563609681103aada9e1777d54fc57a5acd7a41404f9c93b6e" + [[package]] name = "const_panic" version = "0.2.15" @@ -1410,6 +1623,16 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation" version = "0.10.1" @@ -2644,6 +2867,12 @@ version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca81e6b4777c89fd810c25a4be2b1bd93ea034fbe58e6a75216a34c6b82c539b" +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + [[package]] name = "event-listener" version = "5.4.1" @@ -2661,7 +2890,7 @@ 
version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" dependencies = [ - "event-listener", + "event-listener 5.4.1", "pin-project-lite", ] @@ -2677,6 +2906,15 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9afc2bd4d5a73106dd53d10d73d3401c2f32730ba2c0b93ddb888a8983680471" +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -2685,21 +2923,20 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "filetime" -version = "0.2.26" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc0505cd1b6fa6580283f6bdf70a73fcf4aba1184038c90902b92b3dd0df63ed" +checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" dependencies = [ "cfg-if", "libc", "libredox", - "windows-sys 0.60.2", ] [[package]] name = "find-msvc-tools" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f449e6c6c08c865631d4890cfacf252b3d396c9bcc83adb6623cdb02a8336c41" +checksum = "8591b0bcc8a98a64310a2fae1bb3e9b8564dd10e381e6e28010fde8e8e8568db" [[package]] name = "fixedbitset" @@ -2752,6 +2989,21 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -2848,6 +3100,34 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +[[package]] +name = "futures-lite" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand 1.9.0", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + +[[package]] +name = "futures-lite" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f78e10609fe0e0b3f4157ffab1876319b5b0db102a2c60dc4626306dc46b44ad" +dependencies = [ + "fastrand 2.3.0", + "futures-core", + "futures-io", + "parking", + "pin-project-lite", +] + [[package]] name = "futures-macro" version = "0.3.31" @@ -3045,6 +3325,17 @@ dependencies = [ "libm", ] +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -3054,7 +3345,7 @@ dependencies = [ "cfg-if", "js-sys", "libc", - 
"wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -3090,6 +3381,26 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "google-cloud-auth" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5572275b7f06b6fde8eec61a23d87c83aae362bee586bbeb8773b3f98658ae81" +dependencies = [ + "async-trait", + "base64 0.22.1", + "derive_builder", + "http 1.4.0", + "reqwest", + "rustls 0.23.36", + "rustls-pemfile", + "serde", + "serde_json", + "thiserror 2.0.18", + "time", + "tokio", +] + [[package]] name = "h2" version = "0.3.27" @@ -3288,6 +3599,26 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-types" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e9b187a72d63adbfba487f48095306ac823049cb504ee195541e91c7775f5ad" +dependencies = [ + "anyhow", + "async-channel 1.9.0", + "base64 0.13.1", + "futures-lite 1.13.0", + "infer", + "pin-project-lite", + "rand 0.7.3", + "serde", + "serde_json", + "serde_qs", + "serde_urlencoded", + "url", +] + [[package]] name = "httparse" version = "1.10.1" @@ -3386,6 +3717,22 @@ dependencies = [ "webpki-roots", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.8.1", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.19" @@ -3405,9 +3752,11 @@ dependencies = [ "percent-encoding", "pin-project-lite", "socket2 0.6.1", + "system-configuration", "tokio", "tower-service", "tracing", + "windows-registry", ] [[package]] @@ -3650,6 +3999,12 @@ dependencies = [ "rustversion", ] +[[package]] +name = "infer" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac" + [[package]] name = "inout" version = "0.1.4" @@ -3660,6 +4015,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", +] + [[package]] name = "integer-encoding" version = "3.0.4" @@ -3797,9 +4161,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.83" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "464a3709c7f55f1f721e5389aa6ea4e3bc6aba669353300af094b29ffbdde1d8" +checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" dependencies = [ "once_cell", "wasm-bindgen", @@ -4115,7 +4479,7 @@ dependencies = [ "arrow-ord", "arrow-schema", "arrow-select", - "async-channel", + "async-channel 2.5.0", "async-recursion", "async-trait", "bitpacking", @@ -4245,9 +4609,19 @@ dependencies = [ "arrow-ipc", "arrow-schema", "async-trait", + "aws-config", + "aws-credential-types", + "aws-sdk-sts", "axum", + "azure_core", + "azure_identity", + "azure_storage", + "azure_storage_blobs", + "base64 0.22.1", "bytes", + "chrono", "futures", + "google-cloud-auth", "lance", "lance-core", "lance-index", @@ -4259,7 +4633,9 @@ dependencies = [ "reqwest", "serde", "serde_json", + "sha2", "snafu", + "time", "tokio", "tower", "tower-http 0.5.2", @@ -4268,9 +4644,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" 
-version = "0.0.18" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ea349999bcda4eea53fc05d334b3775ec314761e6a706555c777d7a29b18d19" +checksum = "a2acdba67f84190067532fce07b51a435dd390d7cdc1129a05003e5cb3274cf0" dependencies = [ "reqwest", "serde", @@ -4520,7 +4896,7 @@ dependencies = [ "reqwest", "serde", "tar", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "yada", ] @@ -4804,7 +5180,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" dependencies = [ "libc", - "wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "windows-sys 0.61.2", ] @@ -4825,7 +5201,7 @@ dependencies = [ "crossbeam-epoch", "crossbeam-utils", "equivalent", - "event-listener", + "event-listener 5.4.1", "futures-util", "parking_lot", "portable-atomic", @@ -4846,6 +5222,23 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" +[[package]] +name = "native-tls" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe 0.1.6", + "openssl-sys", + "schannel", + "security-framework 2.11.1", + "security-framework-sys", + "tempfile", +] + [[package]] name = "ndarray" version = "0.16.1" @@ -5017,6 +5410,34 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "num_threads" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" +dependencies = [ + "libc", +] + +[[package]] +name = "oauth2" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c38841cdd844847e3e7c8d29cef9dcfed8877f8f56f9071f77843ecf3baf937f" +dependencies = [ + "base64 0.13.1", + "chrono", + "getrandom 0.2.17", + "http 0.2.12", + "rand 0.8.5", + "serde", + "serde_json", + "serde_path_to_error", + "sha2", + "thiserror 1.0.69", + "url", +] + [[package]] name = "object" version = "0.32.2" @@ -5028,9 +5449,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.12.4" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c1be0c6c22ec0817cdc77d3842f721a17fd30ab6965001415b5402a74e6b740" +checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" dependencies = [ "async-trait", "base64 0.22.1", @@ -5055,7 +5476,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tracing", "url", @@ -5128,12 +5549,56 @@ dependencies = [ "uuid", ] +[[package]] +name = "openssl" +version = "0.10.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" +dependencies = [ + "bitflags 2.10.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "openssl-probe" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" + [[package]] name = "openssl-probe" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f50d9b3dabb09ecd771ad0aa242ca6894994c130308ca3d7684634df8037391" +[[package]] +name = "openssl-sys" +version = "0.9.111" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -5402,7 +5867,7 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" dependencies = [ - "fastrand", + "fastrand 2.3.0", "phf_shared 0.13.1", ] @@ -5456,6 +5921,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "piper" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066" +dependencies = [ + "atomic-waker", + "fastrand 2.3.0", + "futures-io", +] + [[package]] name = "pkcs1" version = "0.7.5" @@ -5500,6 +5976,20 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "polling" +version = "3.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218" +dependencies = [ + "cfg-if", + "concurrent-queue", + "hermit-abi", + "pin-project-lite", + "rustix 1.1.3", + "windows-sys 0.61.2", +] + [[package]] name = "portable-atomic" version = "1.13.0" @@ -5777,6 +6267,16 @@ dependencies = [ "serde", ] +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quick-xml" version = "0.37.5" @@ -5811,7 +6311,7 @@ dependencies = [ "rustc-hash", "rustls 0.23.36", "socket2 0.6.1", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tracing", "web-time", @@ -5832,7 +6332,7 @@ dependencies = [ "rustls 0.23.36", "rustls-pki-types", "slab", - "thiserror 2.0.17", + "thiserror 2.0.18", "tinyvec", "tracing", "web-time", @@ -5873,6 +6373,19 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", +] + [[package]] name = "rand" version = "0.8.5" @@ -5894,6 +6407,16 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -5914,6 +6437,15 @@ dependencies = [ "rand_core 0.9.5", ] 
+[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + [[package]] name = "rand_core" version = "0.6.4" @@ -5952,6 +6484,15 @@ dependencies = [ "rand 0.9.2", ] +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + [[package]] name = "rand_xoshiro" version = "0.7.0" @@ -6052,7 +6593,7 @@ checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ "getrandom 0.2.17", "libredox", - "thiserror 2.0.17", + "thiserror 2.0.18", ] [[package]] @@ -6158,11 +6699,13 @@ dependencies = [ "http-body-util", "hyper 1.8.1", "hyper-rustls 0.27.7", + "hyper-tls", "hyper-util", "js-sys", "log", "mime", "mime_guess", + "native-tls", "percent-encoding", "pin-project-lite", "quinn", @@ -6174,6 +6717,7 @@ dependencies = [ "serde_urlencoded", "sync_wrapper", "tokio", + "tokio-native-tls", "tokio-rustls 0.26.4", "tokio-util", "tower", @@ -6335,10 +6879,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" dependencies = [ "aws-lc-rs", + "log", "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.103.8", + "rustls-webpki 0.103.9", "subtle", "zeroize", ] @@ -6349,10 +6894,10 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" dependencies = [ - "openssl-probe", + "openssl-probe 0.2.0", "rustls-pki-types", "schannel", - "security-framework", + "security-framework 3.5.1", ] [[package]] @@ -6366,9 +6911,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.13.2" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21e6f2ab2928ca4291b86736a8bd920a277a399bba1589409d72154ff87c1282" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" dependencies = [ "web-time", "zeroize", @@ -6386,9 +6931,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.8" +version = "0.103.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" +checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" dependencies = [ "aws-lc-rs", "ring", @@ -6492,6 +7037,19 @@ dependencies = [ "untrusted", ] +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags 2.10.0", + "core-foundation 0.9.4", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + [[package]] name = "security-framework" version = "3.5.1" @@ -6499,7 +7057,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" dependencies = [ "bitflags 2.10.0", - "core-foundation", + "core-foundation 0.10.1", "core-foundation-sys", "libc", "security-framework-sys", @@ -6596,6 +7154,17 @@ dependencies = [ "serde_core", ] +[[package]] +name = "serde_qs" +version = "0.8.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" +dependencies = [ + "percent-encoding", + "serde", + "thiserror 1.0.69", +] + [[package]] name = "serde_repr" version = "0.1.20" @@ -6730,7 +7299,7 @@ checksum = "297f631f50729c8c99b84667867963997ec0b50f32b2a7dbcab828ef0541e8bb" dependencies = [ "num-bigint", "num-traits", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", ] @@ -7011,6 +7580,27 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags 2.10.0", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -7063,7 +7653,7 @@ dependencies = [ "tantivy-stacker", "tantivy-tokenizer-api", "tempfile", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", "uuid", "winapi", @@ -7192,7 +7782,7 @@ version = "3.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" dependencies = [ - "fastrand", + "fastrand 2.3.0", "getrandom 0.3.4", "once_cell", "rustix 1.1.3", @@ -7210,11 +7800,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl 2.0.17", + "thiserror-impl 2.0.18", ] [[package]] @@ -7230,9 +7820,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", @@ -7276,7 +7866,10 @@ checksum = "f9e442fc33d7fdb45aa9bfeb312c095964abdf596f7567261062b2a7107aaabd" dependencies = [ "deranged", "itoa", + "js-sys", + "libc", "num-conv", + "num_threads", "powerfmt", "serde_core", "time-core", @@ -7361,6 +7954,16 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.24.1" @@ -7650,7 +8253,7 @@ dependencies = [ "serde", "serde_json", "syn 2.0.114", - "thiserror 2.0.17", + "thiserror 2.0.18", "unicode-ident", ] @@ -7671,6 +8274,15 @@ dependencies = [ "typify-impl", ] +[[package]] +name = "tz-rs" +version = "0.6.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4" +dependencies = [ + "const_fn", +] + [[package]] name = "unicase" version = "2.9.0" @@ -7744,6 +8356,7 @@ dependencies = [ "idna", "percent-encoding", "serde", + "serde_derive", ] 
[[package]] @@ -7788,6 +8401,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -7806,6 +8425,12 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" +[[package]] +name = "waker-fn" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "317211a0dc0ceedd78fb2ca9a44aed3d7b9b26f81870d485c07122b4350673b7" + [[package]] name = "walkdir" version = "2.5.0" @@ -7825,6 +8450,12 @@ dependencies = [ "try-lock", ] +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -7833,18 +8464,18 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasip2" -version = "1.0.1+wasi-0.2.4" +version = "1.0.2+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" dependencies = [ "wit-bindgen", ] [[package]] name = "wasm-bindgen" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d759f433fa64a2d763d1340820e46e111a7a5ab75f993d1852d70b03dbb80fd" +checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" dependencies = [ "cfg-if", "once_cell", @@ -7855,11 +8486,12 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.56" +version = "0.4.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c" +checksum = "70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" dependencies = [ "cfg-if", + "futures-util", "js-sys", "once_cell", "wasm-bindgen", @@ -7868,9 +8500,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48cb0d2638f8baedbc542ed444afc0644a29166f1595371af4fecf8ce1e7eeb3" +checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -7878,9 +8510,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cefb59d5cd5f92d9dcf80e4683949f15ca4b511f4ac0a6e14d4e1ac60c6ecd40" +checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" dependencies = [ "bumpalo", "proc-macro2", @@ -7891,9 +8523,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbc538057e648b67f72a982e708d485b2efa771e1ac05fec311f9f63e5800db4" +checksum = 
"1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" dependencies = [ "unicode-ident", ] @@ -7913,9 +8545,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.83" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b32828d774c412041098d182a8b38b16ea816958e07cf40eec2bc080ae137ac" +checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" dependencies = [ "js-sys", "wasm-bindgen", @@ -8012,6 +8644,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-registry" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link", + "windows-result", + "windows-strings", +] + [[package]] name = "windows-result" version = "0.4.1" @@ -8206,9 +8849,9 @@ dependencies = [ [[package]] name = "wit-bindgen" -version = "0.46.0" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" [[package]] name = "wkb" @@ -8398,9 +9041,9 @@ checksum = "40990edd51aae2c2b6907af74ffb635029d5788228222c4bb811e9351c0caad3" [[package]] name = "zmij" -version = "1.0.14" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd8f3f50b848df28f887acb68e41201b5aea6bc8a8dacc00fb40635ff9a72fea" +checksum = "dfcd145825aace48cff44a8844de64bf75feec3080e0aa5cdbde72961ae51a65" [[package]] name = "zstd" diff --git a/python/Cargo.toml b/python/Cargo.toml index 7f6c1caa70f..eb00dfbc05f 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -73,11 +73,15 @@ url = "2.5.0" bytes = "1.4" [features] -default = ["rest", "rest-adapter"] +default = ["rest", "rest-adapter", "credential-vendor-aws", "credential-vendor-gcp", "credential-vendor-azure"] datagen = ["lance-datagen"] fp16kernels = ["lance/fp16kernels"] rest = ["lance-namespace-impls/rest"] rest-adapter = ["lance-namespace-impls/rest-adapter"] +# Credential vending features for DirectoryNamespace +credential-vendor-aws = ["lance-namespace-impls/credential-vendor-aws"] +credential-vendor-gcp = ["lance-namespace-impls/credential-vendor-gcp"] +credential-vendor-azure = ["lance-namespace-impls/credential-vendor-azure"] [profile.ci] debug = "line-tables-only" diff --git a/python/pyproject.toml b/python/pyproject.toml index 60cf4222978..5cf1205c586 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "pylance" dynamic = ["version"] -dependencies = ["pyarrow>=14", "numpy>=1.22", "lance-namespace>=0.2.1"] +dependencies = ["pyarrow>=14", "numpy>=1.22", "lance-namespace>=0.4.5"] description = "python wrapper for Lance columnar format" authors = [{ name = "Lance Devs", email = "dev@lance.org" }] license = { file = "LICENSE" } @@ -64,7 +64,7 @@ tests = [ ] dev = ["ruff==0.4.1", "pyright"] benchmarks = ["pytest-benchmark"] -torch = ["torch"] +torch = ["torch>=2.0"] geo = [ "geoarrow-rust-core", "geoarrow-rust-io", @@ -112,9 +112,13 @@ filterwarnings = [ 'ignore:.*datetime\.datetime\.utcnow\(\) is deprecated.*:DeprecationWarning', # Pandas 2.2 on Python 2.12 'ignore:.*datetime\.datetime\.utcfromtimestamp\(\) is deprecated.*:DeprecationWarning', - # Pytorch 2.2 on 
Python 2.12 + # Pytorch 2.2 on Python 3.12 'ignore:.*is deprecated and will be removed in Python 3\.14.*:DeprecationWarning', 'ignore:.*The distutils package is deprecated.*:DeprecationWarning', + # Pytorch inductor uses deprecated load_module() in its code cache + 'ignore:.*the load_module\(\) method is deprecated.*:DeprecationWarning', + # Pytorch uses deprecated jit.script_method internally (torch/utils/mkldnn.py) + 'ignore:.*torch\.jit\.script_method.*is deprecated.*:DeprecationWarning', # TensorFlow/Keras import can emit NumPy deprecation FutureWarnings in some environments. # Keep FutureWarnings as errors generally, but ignore this known-noisy import-time warning. 'ignore:.*np\.object.*:FutureWarning', diff --git a/python/python/lance/__init__.py b/python/python/lance/__init__.py index aa05c70286d..83587ffaf72 100644 --- a/python/python/lance/__init__.py +++ b/python/python/lance/__init__.py @@ -95,8 +95,6 @@ def dataset( session: Optional[Session] = None, namespace: Optional[LanceNamespace] = None, table_id: Optional[List[str]] = None, - ignore_namespace_table_storage_options: bool = False, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ) -> LanceDataset: """ Opens the Lance dataset from the address specified. @@ -164,26 +162,13 @@ def dataset( table_id : optional, List[str] The table identifier when using a namespace (e.g., ["my_table"]). Must be provided together with `namespace`. Cannot be used with `uri`. - ignore_namespace_table_storage_options : bool, default False - Only applicable when using `namespace` and `table_id`. If True, storage - options returned from the namespace's describe_table() will be ignored - (treated as None). If False (default), storage options from describe_table() - will be used and a dynamic storage options provider will be created to - automatically refresh credentials before they expire. - s3_credentials_refresh_offset_seconds : optional, int - The number of seconds before credential expiration to trigger a refresh. - Default is 60 seconds. Only applicable when using AWS S3 with temporary - credentials. For example, if set to 60, credentials will be refreshed - when they have less than 60 seconds remaining before expiration. This - should be set shorter than the credential lifetime to avoid using - expired credentials. 
Notes ----- When using `namespace` and `table_id`: - The `uri` parameter is optional and will be fetched from the namespace - - Storage options from describe_table() will be used unless - `ignore_namespace_table_storage_options=True` + - Storage options from describe_table() will be used automatically + - A dynamic storage options provider will be created to refresh credentials - Initial storage options from describe_table() will be merged with any provided `storage_options` """ @@ -216,10 +201,7 @@ def dataset( if uri is None: raise ValueError("Namespace did not return a 'location' for the table") - if ignore_namespace_table_storage_options: - namespace_storage_options = None - else: - namespace_storage_options = response.storage_options + namespace_storage_options = response.storage_options if namespace_storage_options: storage_options_provider = LanceNamespaceStorageOptionsProvider( @@ -247,7 +229,6 @@ def dataset( read_params=read_params, session=session, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, ) if version is None and asof is not None: ts_cutoff = sanitize_ts(asof) @@ -272,7 +253,6 @@ def dataset( read_params=read_params, session=session, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, ) else: return ds diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index ab40b265a04..2b13b9a8beb 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -422,7 +422,6 @@ def __init__( read_params: Optional[Dict[str, Any]] = None, session: Optional[Session] = None, storage_options_provider: Optional[Any] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ): uri = os.fspath(uri) if isinstance(uri, Path) else uri self._uri = uri @@ -454,7 +453,6 @@ def __init__( read_params=read_params, session=session, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, ) self._default_scan_options = default_scan_options self._read_params = read_params @@ -2224,7 +2222,52 @@ def latest_version(self) -> int: """ return self._ds.latest_version() - def checkout_version(self, version: int | str | Tuple[str, int]) -> "LanceDataset": + @property + def initial_storage_options(self) -> Optional[Dict[str, str]]: + """ + Get the initial storage options used to open this dataset. + + This returns the options that were provided when the dataset was opened, + without any refresh from the provider. Returns None if no storage options + were provided. + """ + return self._ds.initial_storage_options() + + def latest_storage_options(self) -> Optional[Dict[str, str]]: + """ + Get the latest storage options, potentially refreshed from the provider. + + If a storage options provider was configured and credentials are expiring, + this will refresh them. + + Returns + ------- + Optional[Dict[str, str]] + - Storage options dict if configured (static or refreshed from provider) + - None if no storage options were configured for this dataset + + Raises + ------ + IOError + If an error occurs while fetching/refreshing options from the provider + """ + return self._ds.latest_storage_options() + + @property + def storage_options_accessor(self): + """ + Get the storage options accessor for this dataset. + + The accessor bundles static storage options and optional dynamic provider, + handling caching and refresh logic internally. 
+ + Returns None if neither storage options nor a provider were configured. + """ + return self._ds.storage_options_accessor() + + def checkout_version( + self, version: int | str | Tuple[Optional[str], Optional[int]] + ) -> "LanceDataset": """ Load the given version of the dataset. @@ -5352,8 +5395,6 @@ def write_dataset( target_bases: Optional[List[str]] = None, namespace: Optional[LanceNamespace] = None, table_id: Optional[List[str]] = None, - ignore_namespace_table_storage_options: bool = False, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ) -> LanceDataset: """Write a given data_obj to the given uri @@ -5455,29 +5496,16 @@ def write_dataset( table_id : optional, List[str] The table identifier when using a namespace (e.g., ["my_table"]). Must be provided together with `namespace`. Cannot be used with `uri`. - ignore_namespace_table_storage_options : bool, default False - If True, ignore the storage options returned by the namespace and only use - the provided `storage_options` parameter. The storage options provider will - not be created, so credentials will not be automatically refreshed. - This is useful when you want to use your own credentials instead of the - namespace-provided credentials. - s3_credentials_refresh_offset_seconds : optional, int - The number of seconds before credential expiration to trigger a refresh. - Default is 60 seconds. Only applicable when using AWS S3 with temporary - credentials. For example, if set to 60, credentials will be refreshed - when they have less than 60 seconds remaining before expiration. This - should be set shorter than the credential lifetime to avoid using - expired credentials. Notes ----- When using `namespace` and `table_id`: - The `uri` parameter is optional and will be fetched from the namespace + - Storage options from describe_table() will be used automatically - A `LanceNamespaceStorageOptionsProvider` will be created automatically for - storage options refresh (unless `ignore_namespace_table_storage_options=True`) + storage options refresh - Initial storage options from describe_table() will be merged with - any provided `storage_options` (unless - `ignore_namespace_table_storage_options=True`) + any provided `storage_options` """ # Validate that user provides either uri OR (namespace + table_id), not both has_uri = uri is not None @@ -5508,16 +5536,48 @@ def write_dataset( from .namespace import ( CreateEmptyTableRequest, + DeclareTableRequest, DescribeTableRequest, LanceNamespaceStorageOptionsProvider, ) # Determine which namespace method to call based on mode if mode == "create": - request = CreateEmptyTableRequest( - id=table_id, location=None, properties=None - ) - response = namespace.create_empty_table(request) + # Try declare_table first, fall back to deprecated create_empty_table + # for backward compatibility with older namespace implementations. + # create_empty_table support will be removed in 3.0.0. + if hasattr(namespace, "declare_table"): + try: + from lance_namespace.errors import UnsupportedOperationError + + declare_request = DeclareTableRequest(id=table_id, location=None) + response = namespace.declare_table(declare_request) + except (UnsupportedOperationError, NotImplementedError): + # Fall back to deprecated create_empty_table + import warnings + + warnings.warn( + "create_empty_table is deprecated, use declare_table instead. 
" + "Support will be removed in 3.0.0.", + DeprecationWarning, + stacklevel=2, + ) + fallback_request = CreateEmptyTableRequest( + id=table_id, location=None + ) + response = namespace.create_empty_table(fallback_request) + else: + # Namespace doesn't have declare_table, fall back to create_empty_table + import warnings + + warnings.warn( + "create_empty_table is deprecated, use declare_table instead. " + "Support will be removed in 3.0.0.", + DeprecationWarning, + stacklevel=2, + ) + fallback_request = CreateEmptyTableRequest(id=table_id, location=None) + response = namespace.create_empty_table(fallback_request) elif mode in ("append", "overwrite"): request = DescribeTableRequest(id=table_id, version=None) response = namespace.describe_table(request) @@ -5531,11 +5591,8 @@ def write_dataset( f"Namespace did not return a table location in {mode} response" ) - # Check if we should ignore namespace storage options - if ignore_namespace_table_storage_options: - namespace_storage_options = None - else: - namespace_storage_options = response.storage_options + # Use namespace storage options + namespace_storage_options = response.storage_options # Set up storage options and provider if namespace_storage_options: @@ -5598,12 +5655,6 @@ def write_dataset( if storage_options_provider is not None: params["storage_options_provider"] = storage_options_provider - # Add s3_credentials_refresh_offset_seconds if specified - if s3_credentials_refresh_offset_seconds is not None: - params["s3_credentials_refresh_offset_seconds"] = ( - s3_credentials_refresh_offset_seconds - ) - if commit_lock: if not callable(commit_lock): raise TypeError(f"commit_lock must be a function, got {type(commit_lock)}") diff --git a/python/python/lance/file.py b/python/python/lance/file.py index dec4aea00b6..8a20e4aff2f 100644 --- a/python/python/lance/file.py +++ b/python/python/lance/file.py @@ -68,7 +68,6 @@ def __init__( columns: Optional[List[str]] = None, *, storage_options_provider: Optional[StorageOptionsProvider] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, _inner_reader: Optional[_LanceFileReader] = None, ): """ @@ -86,9 +85,6 @@ def __init__( storage_options_provider : optional A provider that can provide storage options dynamically. This is useful for credentials that need to be refreshed or vended on-demand. - s3_credentials_refresh_offset_seconds : optional, int - How early (in seconds) before expiration to refresh S3 credentials. - Default is 60 seconds. Only applies when using storage_options_provider. columns: list of str, default None List of column names to be fetched. All columns are fetched if None or unspecified. @@ -102,7 +98,6 @@ def __init__( path, storage_options=storage_options, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, columns=columns, ) @@ -219,7 +214,6 @@ def __init__( base_path: str, storage_options: Optional[Dict[str, str]] = None, storage_options_provider: Optional[StorageOptionsProvider] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ): """ Creates a new file session @@ -236,9 +230,6 @@ def __init__( storage_options_provider : optional A provider that can provide storage options dynamically. This is useful for credentials that need to be refreshed or vended on-demand. - s3_credentials_refresh_offset_seconds : optional, int - How early (in seconds) before expiration to refresh S3 credentials. - Default is 60 seconds. 
Only applies when using storage_options_provider. """ if isinstance(base_path, Path): base_path = str(base_path) @@ -246,7 +237,6 @@ def __init__( base_path, storage_options=storage_options, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, ) def open_reader( @@ -391,7 +381,6 @@ def __init__( version: Optional[str] = None, storage_options: Optional[Dict[str, str]] = None, storage_options_provider: Optional[StorageOptionsProvider] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, max_page_bytes: Optional[int] = None, _inner_writer: Optional[_LanceFileWriter] = None, **kwargs, @@ -422,9 +411,6 @@ def __init__( A storage options provider that can fetch and refresh storage options dynamically. This is useful for credentials that expire and need to be refreshed automatically. - s3_credentials_refresh_offset_seconds : optional, int - How early (in seconds) before expiration to refresh S3 credentials. - Default is 60 seconds. Only applies when using storage_options_provider. max_page_bytes : optional, int The maximum size of a page in bytes, if a single array would create a page larger than this then it will be split into multiple pages. The @@ -442,7 +428,6 @@ def __init__( version=version, storage_options=storage_options, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, max_page_bytes=max_page_bytes, **kwargs, ) diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index f0cf1243d61..9ecc271754f 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -95,7 +95,6 @@ class LanceFileWriter: version: Optional[str], storage_options: Optional[Dict[str, str]], storage_options_provider: Optional[StorageOptionsProvider], - s3_credentials_refresh_offset_seconds: Optional[int], keep_original_array: Optional[bool], max_page_bytes: Optional[int], ): ... @@ -110,7 +109,6 @@ class LanceFileSession: base_path: str, storage_options: Optional[Dict[str, str]] = None, storage_options_provider: Optional[StorageOptionsProvider] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ): ... def open_reader( self, path: str, columns: Optional[List[str]] = None @@ -135,7 +133,6 @@ class LanceFileReader: path: str, storage_options: Optional[Dict[str, str]], storage_options_provider: Optional[StorageOptionsProvider], - s3_credentials_refresh_offset_seconds: Optional[int], columns: Optional[List[str]] = None, ): ... def read_all( diff --git a/python/python/lance/namespace.py b/python/python/lance/namespace.py index 426c7176d74..8796414daca 100644 --- a/python/python/lance/namespace.py +++ b/python/python/lance/namespace.py @@ -7,11 +7,13 @@ 1. Native Rust-backed namespace implementations (DirectoryNamespace, RestNamespace) 2. Storage options integration with LanceNamespace for automatic credential refresh 3. Plugin registry for external namespace implementations +4. Dynamic context provider registry for per-request context injection The LanceNamespace ABC interface is provided by the lance_namespace package. 
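Feature (4) can also be configured without code: properties prefixed with ``dynamic_context_provider.`` are interpreted here at the Python layer. As a sketch (the class path below is hypothetical, not part of this package), passing ``{"dynamic_context_provider.impl": "my_module.MyProvider", "dynamic_context_provider.api_key": "secret"}`` to ``DirectoryNamespace`` or ``RestNamespace`` imports ``my_module.MyProvider``, instantiates it as ``MyProvider(api_key="secret")``, and strips these keys before the remaining properties are handed to Rust.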
""" -from typing import Dict, List +from abc import ABC, abstractmethod +from typing import Dict, List, Optional from lance_namespace import ( CreateEmptyTableRequest, @@ -20,6 +22,8 @@ CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, + DeclareTableRequest, + DeclareTableResponse, DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, @@ -59,9 +63,148 @@ "RestNamespace", "RestAdapter", "LanceNamespaceStorageOptionsProvider", + "DynamicContextProvider", ] +# ============================================================================= +# Dynamic Context Provider +# ============================================================================= + + +class DynamicContextProvider(ABC): + """Abstract base class for dynamic context providers. + + Implementations provide per-request context (e.g., authentication headers) + based on the operation being performed. The provider is called synchronously + before each namespace operation. + + For RestNamespace, context keys that start with `headers.` are converted to + HTTP headers by stripping the prefix. For example, `{"headers.Authorization": + "Bearer token"}` becomes the `Authorization: Bearer token` header. + + Example + ------- + >>> # Define a provider class + >>> class MyProvider(DynamicContextProvider): + ... def __init__(self, api_key: str): + ... self.api_key = api_key + ... + ... def provide_context(self, info: dict) -> dict: + ... return { + ... "headers.Authorization": f"Bearer {self.api_key}", + ... } + ... + >>> # Create provider instance and use directly + >>> provider = MyProvider(api_key="secret") + >>> provider.provide_context({"operation": "list_tables", "object_id": "ns"}) + {'headers.Authorization': 'Bearer secret'} + """ + + @abstractmethod + def provide_context(self, info: Dict[str, str]) -> Dict[str, str]: + """Provide context for a namespace operation. + + Parameters + ---------- + info : dict + Information about the operation: + - operation: The operation name (e.g., "list_tables", "describe_table") + - object_id: The object identifier (namespace or table ID) + + Returns + ------- + dict + Context key-value pairs. For HTTP headers, use keys with the + "headers." prefix (e.g., "headers.Authorization"). + """ + pass + + +def _create_context_provider_from_properties( + properties: Dict[str, str], +) -> Optional[DynamicContextProvider]: + """Create a context provider instance from properties. + + Extracts `dynamic_context_provider.*` properties and creates a provider + instance by dynamically loading the class from the given class path. + + Parameters + ---------- + properties : dict + The full properties dict that may contain dynamic_context_provider.* keys. + + Returns + ------- + DynamicContextProvider or None + The created provider instance, or None if no provider is configured. + + Raises + ------ + ValueError + If dynamic_context_provider.impl is set but the class cannot be loaded. + """ + import importlib + + prefix = "dynamic_context_provider." + impl_key = "dynamic_context_provider.impl" + + impl_path = properties.get(impl_key) + if not impl_path: + return None + + # Parse the class path (e.g., "my_module.submodule.MyClass") + if "." not in impl_path: + raise ValueError( + f"Invalid context provider class path '{impl_path}'. 
" + f"Expected format: 'module.ClassName' (e.g., 'my_module.MyProvider')" + ) + + module_path, class_name = impl_path.rsplit(".", 1) + + try: + module = importlib.import_module(module_path) + provider_class = getattr(module, class_name) + except ModuleNotFoundError as e: + raise ValueError( + f"Failed to import module '{module_path}' for context provider: {e}" + ) from e + except AttributeError as e: + raise ValueError( + f"Class '{class_name}' not found in module '{module_path}': {e}" + ) from e + + # Extract provider-specific properties (strip prefix, exclude impl key) + provider_props = {} + for key, value in properties.items(): + if key.startswith(prefix) and key != impl_key: + prop_name = key[len(prefix) :] + provider_props[prop_name] = value + + # Create the provider instance + return provider_class(**provider_props) + + +def _filter_context_provider_properties(properties: Dict[str, str]) -> Dict[str, str]: + """Remove dynamic_context_provider.* properties from the dict. + + These properties are handled at the Python level and should not be + passed to the Rust layer. + + Parameters + ---------- + properties : dict + The full properties dict. + + Returns + ------- + dict + Properties with dynamic_context_provider.* keys removed. + """ + prefix = "dynamic_context_provider." + return {k: v for k, v in properties.items() if not k.startswith(prefix)} + + class DirectoryNamespace(LanceNamespace): """Directory-based Lance Namespace implementation backed by Rust. @@ -86,6 +229,40 @@ class DirectoryNamespace(LanceNamespace): (e.g., storage.region="us-west-2" becomes region="us-west-2" in storage options) + Credential vendor properties (vendor is auto-selected based on table location): + When credential vendor properties are configured, describe_table() will + return vended temporary credentials. The vendor type is auto-selected + based on table location URI: s3:// for AWS, gs:// for GCP, az:// for + Azure. Requires the corresponding credential-vendor-* feature. + + Common properties: + - credential_vendor.enabled (required): Set to "true" to enable + - credential_vendor.permission (optional): read, write, or admin + + AWS-specific properties (for s3:// locations): + - credential_vendor.aws_role_arn (required): IAM role ARN to assume + - credential_vendor.aws_external_id (optional): External ID + - credential_vendor.aws_region (optional): AWS region + - credential_vendor.aws_role_session_name (optional): Session name + - credential_vendor.aws_duration_millis (optional): Duration in ms + (default: 3600000, range: 15min-12hrs) + + GCP-specific properties (for gs:// locations): + - credential_vendor.gcp_service_account (optional): Service account + to impersonate using IAM Credentials API + + Note: GCP uses Application Default Credentials (ADC). To use a service + account key file, set the GOOGLE_APPLICATION_CREDENTIALS environment + variable before starting. GCP token duration cannot be configured; + it's determined by the STS endpoint (typically 1 hour). 
+ + Azure-specific properties (for az:// locations): + - credential_vendor.azure_account_name (required): Azure storage + account name + - credential_vendor.azure_tenant_id (optional): Azure tenant ID + - credential_vendor.azure_duration_millis (optional): Duration in ms + (default: 3600000, up to 7 days) + Examples -------- >>> import lance.namespace @@ -95,14 +272,49 @@ class DirectoryNamespace(LanceNamespace): >>> # Using the connect() factory function from lance_namespace >>> import lance_namespace >>> ns = lance_namespace.connect("dir", {"root": "memory://test"}) + >>> + >>> # With AWS credential vending (requires credential-vendor-aws feature) + >>> # Use **dict to pass property names with dots + >>> ns = lance.namespace.DirectoryNamespace(**{ + ... "root": "s3://my-bucket/data", + ... "credential_vendor.enabled": "true", + ... "credential_vendor.aws_role_arn": "arn:aws:iam::123456789012:role/MyRole", + ... "credential_vendor.aws_duration_millis": "3600000", + ... }) + + With dynamic context provider: + + >>> import tempfile + >>> class MyProvider(DynamicContextProvider): + ... def __init__(self, token: str): + ... self.token = token + ... def provide_context(self, info: dict) -> dict: + ... return {"headers.Authorization": f"Bearer {self.token}"} + ... + >>> provider = MyProvider(token="secret-token") + >>> with tempfile.TemporaryDirectory() as tmpdir: + ... ns = lance.namespace.DirectoryNamespace( + ... root=tmpdir, + ... context_provider=provider, + ... ) + ... _ = ns.namespace_id() # verify it works """ - def __init__(self, session=None, **properties): + def __init__(self, session=None, context_provider=None, **properties): # Convert all values to strings as expected by Rust from_properties str_properties = {str(k): str(v) for k, v in properties.items()} + # Create context provider from properties if configured + if context_provider is None: + context_provider = _create_context_provider_from_properties(str_properties) + + # Filter out dynamic_context_provider.* properties before passing to Rust + filtered_properties = _filter_context_provider_properties(str_properties) + # Create the underlying Rust namespace - self._inner = PyDirectoryNamespace(session=session, **str_properties) + self._inner = PyDirectoryNamespace( + session=session, context_provider=context_provider, **filtered_properties + ) def namespace_id(self) -> str: """Return a human-readable unique identifier for this namespace instance.""" @@ -175,6 +387,10 @@ def create_empty_table( response_dict = self._inner.create_empty_table(request.model_dump()) return CreateEmptyTableResponse.from_dict(response_dict) + def declare_table(self, request: DeclareTableRequest) -> DeclareTableResponse: + response_dict = self._inner.declare_table(request.model_dump()) + return DeclareTableResponse.from_dict(response_dict) + class RestNamespace(LanceNamespace): """REST-based Lance Namespace implementation backed by Rust. @@ -205,9 +421,25 @@ class RestNamespace(LanceNamespace): >>> # Using the connect() factory function from lance_namespace >>> import lance_namespace >>> ns = lance_namespace.connect("rest", {"uri": "http://localhost:4099"}) + + With dynamic context provider: + + >>> class AuthProvider(DynamicContextProvider): + ... def __init__(self, api_key: str): + ... self.api_key = api_key + ... def provide_context(self, info: dict) -> dict: + ... return {"headers.Authorization": f"Bearer {self.api_key}"} + ... + >>> provider = AuthProvider(api_key="my-secret-key") + >>> ns = lance.namespace.RestNamespace( + ... 
uri="http://localhost:4099", + ... context_provider=provider, + ... ) + >>> ns.namespace_id() # verify it works + 'RestNamespace { endpoint: "http://localhost:4099", delimiter: "$" }' """ - def __init__(self, **properties): + def __init__(self, context_provider=None, **properties): if PyRestNamespace is None: raise RuntimeError( "RestNamespace is not available. " @@ -217,8 +449,17 @@ def __init__(self, **properties): # Convert all values to strings as expected by Rust from_properties str_properties = {str(k): str(v) for k, v in properties.items()} + # Create context provider from properties if configured + if context_provider is None: + context_provider = _create_context_provider_from_properties(str_properties) + + # Filter out dynamic_context_provider.* properties before passing to Rust + filtered_properties = _filter_context_provider_properties(str_properties) + # Create the underlying Rust namespace - self._inner = PyRestNamespace(**str_properties) + self._inner = PyRestNamespace( + context_provider=context_provider, **filtered_properties + ) def namespace_id(self) -> str: """Return a human-readable unique identifier for this namespace instance.""" @@ -291,6 +532,10 @@ def create_empty_table( response_dict = self._inner.create_empty_table(request.model_dump()) return CreateEmptyTableResponse.from_dict(response_dict) + def declare_table(self, request: DeclareTableRequest) -> DeclareTableResponse: + response_dict = self._inner.declare_table(request.model_dump()) + return DeclareTableResponse.from_dict(response_dict) + class RestAdapter: """REST adapter server that creates a namespace backend and exposes it via REST. diff --git a/python/python/lance/torch/distance.py b/python/python/lance/torch/distance.py index 06388210544..3c9becfd749 100644 --- a/python/python/lance/torch/distance.py +++ b/python/python/lance/torch/distance.py @@ -16,7 +16,7 @@ ] -@torch.jit.script +@torch.compile def _pairwise_cosine( x: torch.Tensor, y: torch.Tensor, y2: torch.Tensor ) -> torch.Tensor: @@ -49,7 +49,7 @@ def pairwise_cosine( return _pairwise_cosine(x, y, y2) -@torch.jit.script +@torch.compile def _cosine_distance( vectors: torch.Tensor, centroids: torch.Tensor, split_size: int ) -> Tuple[torch.Tensor, torch.Tensor]: @@ -114,7 +114,7 @@ def cosine_distance( raise RuntimeError("Cosine distance out of memory") -@torch.jit.script +@torch.compile def argmin_l2(x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: x = x.reshape(1, x.shape[0], -1) y = y.reshape(1, y.shape[0], -1) @@ -125,7 +125,7 @@ def argmin_l2(x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Ten return min_dists.pow(2), idx -@torch.jit.script +@torch.compile def pairwise_l2( x: torch.Tensor, y: torch.Tensor, y2: Optional[torch.Tensor] = None ) -> torch.Tensor: @@ -170,7 +170,7 @@ def pairwise_l2( return dists.type(origin_dtype) -@torch.jit.script +@torch.compile def _l2_distance( x: torch.Tensor, y: torch.Tensor, @@ -237,7 +237,7 @@ def l2_distance( raise RuntimeError("L2 distance out of memory") -@torch.jit.script +@torch.compile def dot_distance(x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """Pair-wise dot distance between two 2-D Tensors. 
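Note on the distance.py hunks above: @torch.compile replaces @torch.jit.script as the kernel-fusion mechanism; unlike TorchScript it compiles on first invocation and can fall back to eager execution. A minimal standalone sketch of the same decorator pattern (not from this diff; assumes torch >= 2.0):

    import torch
    from typing import Tuple

    @torch.compile
    def argmin_l2_sketch(x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        # Pairwise L2 distances between rows of x and rows of y, then for each
        # row of x the index of (and squared distance to) its nearest row of y.
        dists = torch.cdist(x, y)
        min_dists, idx = dists.min(dim=1)
        return min_dists.pow(2), idx

    x, y = torch.randn(8, 4), torch.randn(16, 4)
    dists, idx = argmin_l2_sketch(x, y)  # compiled on the first call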
diff --git a/python/python/tests/test_namespace_integration.py b/python/python/tests/test_namespace_integration.py index 592bbd2c3ef..30489496e38 100644 --- a/python/python/tests/test_namespace_integration.py +++ b/python/python/tests/test_namespace_integration.py @@ -22,6 +22,8 @@ from lance.namespace import ( CreateEmptyTableRequest, CreateEmptyTableResponse, + DeclareTableRequest, + DeclareTableResponse, DescribeTableRequest, DescribeTableResponse, LanceNamespace, @@ -126,6 +128,8 @@ def _modify_storage_options( (time.time() + self.credential_expires_in_seconds) * 1000 ) modified["expires_at_millis"] = str(expires_at_millis) + # Set refresh offset to 1 second (1000ms) for short-lived credential tests + modified["refresh_offset_millis"] = "1000" return modified @@ -143,6 +147,18 @@ def create_empty_table( return response + def declare_table(self, request: DeclareTableRequest) -> DeclareTableResponse: + with self.lock: + self.create_call_count += 1 + count = self.create_call_count + + response = self.inner.declare_table(request) + response.storage_options = self._modify_storage_options( + response.storage_options, count + ) + + return response + def describe_table(self, request: DescribeTableRequest) -> DescribeTableResponse: with self.lock: self.describe_call_count += 1 @@ -221,7 +237,6 @@ def test_namespace_with_refresh(s3_bucket: str): namespace=namespace, table_id=table_id, mode="create", - s3_credentials_refresh_offset_seconds=1, ) assert ds.count_rows() == 2 assert namespace.get_create_call_count() == 1 @@ -229,7 +244,6 @@ def test_namespace_with_refresh(s3_bucket: str): ds_from_namespace = lance.dataset( namespace=namespace, table_id=table_id, - s3_credentials_refresh_offset_seconds=1, ) initial_call_count = namespace.get_describe_call_count() @@ -434,8 +448,8 @@ def test_namespace_distributed_write(s3_bucket: str): table_name = uuid.uuid4().hex table_id = ["test_ns", table_name] - request = CreateEmptyTableRequest(id=table_id, location=None, properties=None) - response = namespace.create_empty_table(request) + request = DeclareTableRequest(id=table_id, location=None) + response = namespace.declare_table(request) assert namespace.get_create_call_count() == 1 assert namespace.get_describe_call_count() == 0 @@ -560,7 +574,6 @@ def test_file_writer_with_storage_options_provider(s3_bucket: str): schema=schema, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) batch = pa.RecordBatch.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6]}, schema=schema) @@ -579,7 +592,6 @@ def test_file_writer_with_storage_options_provider(s3_bucket: str): file_uri, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) result = reader.read_all(batch_size=1024) result_table = result.to_table() @@ -599,7 +611,6 @@ def test_file_writer_with_storage_options_provider(s3_bucket: str): schema=schema, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) batch3 = pa.RecordBatch.from_pydict( @@ -615,7 +626,6 @@ def test_file_writer_with_storage_options_provider(s3_bucket: str): file_uri2, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) result2 = reader2.read_all(batch_size=1024) result_table2 = result2.to_table() @@ -682,7 +692,6 @@ def test_file_reader_with_storage_options_provider(s3_bucket: str): file_uri, 
storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) result = reader.read_all(batch_size=1024) result_table = result.to_table() @@ -713,7 +722,6 @@ def test_file_reader_with_storage_options_provider(s3_bucket: str): file_uri2, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) result2 = reader2.read_all(batch_size=1024) result_table2 = result2.to_table() @@ -764,7 +772,6 @@ def test_file_session_with_storage_options_provider(s3_bucket: str): f"s3://{s3_bucket}/{table_name}_session", storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) # Test contains method diff --git a/python/python/tests/test_namespace_rest.py b/python/python/tests/test_namespace_rest.py index 7fa3a65c5f1..de1a57ace8d 100644 --- a/python/python/tests/test_namespace_rest.py +++ b/python/python/tests/test_namespace_rest.py @@ -680,3 +680,66 @@ def test_connect_with_custom_delimiter(self): ipc_data = table_to_ipc_bytes(table_data) response = ns.create_table(create_req, ipc_data) assert response is not None + + +class TestDynamicContextProvider: + """Tests for DynamicContextProvider with RestNamespace.""" + + def test_rest_namespace_with_explicit_provider(self): + """Test RestNamespace with an explicit context provider.""" + call_count = {"count": 0} + + class TestProvider(lance.namespace.DynamicContextProvider): + def provide_context(self, info): + call_count["count"] += 1 + return { + "headers.Authorization": "Bearer test-token", + "headers.X-Request-Id": f"req-{info.get('operation', 'unknown')}", + } + + with tempfile.TemporaryDirectory() as tmpdir: + backend_config = {"root": tmpdir} + + with lance.namespace.RestAdapter("dir", backend_config, port=0) as adapter: + ns = lance.namespace.RestNamespace( + uri=f"http://127.0.0.1:{adapter.port}", + context_provider=TestProvider(), + ) + + # Perform operations + create_req = CreateNamespaceRequest(id=["workspace"]) + ns.create_namespace(create_req) + + list_req = ListNamespacesRequest(id=[]) + ns.list_namespaces(list_req) + + # Context provider should have been called + assert call_count["count"] >= 2 + + def test_explicit_provider_takes_precedence(self): + """Test that explicit provider takes precedence over class path.""" + explicit_called = {"called": False} + + class ExplicitProvider(lance.namespace.DynamicContextProvider): + def provide_context(self, info): + explicit_called["called"] = True + return {"headers.Authorization": "Bearer explicit"} + + with tempfile.TemporaryDirectory() as tmpdir: + backend_config = {"root": tmpdir} + + with lance.namespace.RestAdapter("dir", backend_config, port=0) as adapter: + # Pass both explicit provider and class path - explicit should win + ns = lance.namespace.RestNamespace( + context_provider=ExplicitProvider(), + **{ + "uri": f"http://127.0.0.1:{adapter.port}", + "dynamic_context_provider.impl": "nonexistent.Provider", + }, + ) + + create_req = CreateNamespaceRequest(id=["workspace"]) + ns.create_namespace(create_req) + + # Explicit provider should have been used + assert explicit_called["called"] diff --git a/python/src/dataset.rs b/python/src/dataset.rs index ade2b4516ca..37334ad9352 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -91,6 +91,7 @@ use crate::rt; use crate::scanner::ScanStatistics; use crate::schema::{logical_schema_from_lance, LanceSchema}; use crate::session::Session; +use 
crate::storage_options::PyStorageOptionsAccessor; use crate::utils::PyLance; use crate::{LanceReader, Scanner}; @@ -456,8 +457,9 @@ pub struct Dataset { #[pymethods] impl Dataset { #[allow(clippy::too_many_arguments)] + #[allow(deprecated)] #[new] - #[pyo3(signature=(uri, version=None, block_size=None, index_cache_size=None, metadata_cache_size=None, commit_handler=None, storage_options=None, manifest=None, metadata_cache_size_bytes=None, index_cache_size_bytes=None, read_params=None, session=None, storage_options_provider=None, s3_credentials_refresh_offset_seconds=None))] + #[pyo3(signature=(uri, version=None, block_size=None, index_cache_size=None, metadata_cache_size=None, commit_handler=None, storage_options=None, manifest=None, metadata_cache_size_bytes=None, index_cache_size_bytes=None, read_params=None, session=None, storage_options_provider=None))] fn new( py: Python, uri: String, @@ -472,8 +474,7 @@ index_cache_size_bytes: Option<usize>, read_params: Option<&Bound<'_, PyDict>>, session: Option<Session>, - storage_options_provider: Option<PyObject>, - s3_credentials_refresh_offset_seconds: Option<u64>, + storage_options_provider: Option<&Bound<'_, PyAny>>, ) -> PyResult<Self> { let mut params = ReadParams::default(); if let Some(metadata_cache_size_bytes) = metadata_cache_size_bytes { @@ -490,16 +491,12 @@ let index_cache_size_bytes = index_cache_size * 20 * 1024 * 1024; params.index_cache_size_bytes(index_cache_size_bytes); } - // Set up store options (block size and S3 credentials refresh offset) - let mut store_params = params.store_options.take().unwrap_or_default(); + // Set up store options (block size) if let Some(block_size) = block_size { + let mut store_params = params.store_options.take().unwrap_or_default(); store_params.block_size = Some(block_size); + params.store_options = Some(store_params); } - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - store_params.s3_credentials_refresh_offset = - std::time::Duration::from_secs(offset_seconds); - } - params.store_options = Some(store_params); if let Some(commit_handler) = commit_handler { let py_commit_lock = PyCommitLock::new(commit_handler); params.set_commit_lock(Arc::new(py_commit_lock)); @@ -1447,8 +1444,40 @@ impl Dataset { .map_err(|err| PyIOError::new_err(err.to_string())) } - fn checkout_version(&self, py: Python, version: PyObject) -> PyResult<Self> { - let reference = self.transform_ref(py, Some(version))?; + /// Get the initial storage options used to open this dataset. + /// + /// This returns the options that were provided when the dataset was opened, + /// without any refresh from the provider. Returns None if no storage options + /// were provided. + fn initial_storage_options(&self) -> Option<HashMap<String, String>> { + self.ds.initial_storage_options().cloned() + } + + /// Get the latest storage options, potentially refreshed from the provider. + /// + /// If a storage options provider was configured and credentials are expiring, + /// this will refresh them. Returns the current valid storage options, or None + /// if no storage options accessor is configured. + fn latest_storage_options(self_: PyRef<'_, Self>) -> PyResult<Option<HashMap<String, String>>> { + let result = rt() + .block_on(Some(self_.py()), self_.ds.latest_storage_options())? + .map_err(|err| PyIOError::new_err(err.to_string()))?; + Ok(result.map(|opts| opts.0)) + } + + /// Get the storage options accessor for this dataset. + /// + /// The accessor bundles static storage options and an optional dynamic provider, + /// handling caching and refresh logic internally.
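+ /// A hedged Python-side sketch of how the returned object can be used + /// (method names follow the pymethods defined in storage_options.rs): + /// `acc = ds.storage_options_accessor()`; when `acc is not None` and + /// `acc.has_provider()` is true, `acc.get_storage_options()` returns the + /// current options, refreshing via the provider as credentials near expiry.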
+ fn storage_options_accessor(&self) -> Option<PyStorageOptionsAccessor> { + self.ds + .storage_options_accessor() + .map(PyStorageOptionsAccessor::new) + } + + fn checkout_version(&self, version: Bound<'_, PyAny>) -> PyResult<Self> { + let reference = + Python::with_gil(|py| self.transform_ref(py, Some(version.clone().unbind())))?; self._checkout_version(reference) } @@ -1465,7 +1494,9 @@ // `version` can be a version number or a tag name. // `storage_options` will be forwarded to the object store params for the new dataset. let store_params = storage_options.as_ref().map(|opts| ObjectStoreParams { - storage_options: Some(opts.clone()), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(opts.clone()), + )), ..Default::default() }); @@ -1647,7 +1678,9 @@ // Build Ref from python object let reference = self.transform_ref(py, reference)?; let store_params = storage_options.map(|opts| ObjectStoreParams { - storage_options: Some(opts), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(opts), + )), ..Default::default() }); let created = rt() @@ -2099,7 +2132,7 @@ read_version: Option<u64>, commit_lock: Option<&Bound<'_, PyAny>>, storage_options: Option<HashMap<String, String>>, - storage_options_provider: Option<PyObject>, + storage_options_provider: Option<&Bound<'_, PyAny>>, enable_v2_manifest_paths: Option<bool>, detached: Option<bool>, max_retries: Option<u32>, ) -> PyResult<Self> { @@ -2127,6 +2160,7 @@ } #[allow(clippy::too_many_arguments)] + #[allow(deprecated)] #[staticmethod] #[pyo3(signature = (dest, transaction, commit_lock = None, storage_options = None, storage_options_provider = None, enable_v2_manifest_paths = None, detached = None, max_retries = None))] fn commit_transaction( dest, transaction: PyLance<Transaction>, commit_lock: Option<&Bound<'_, PyAny>>, storage_options: Option<HashMap<String, String>>, - storage_options_provider: Option<PyObject>, + storage_options_provider: Option<&Bound<'_, PyAny>>, enable_v2_manifest_paths: Option<bool>, detached: Option<bool>, max_retries: Option<u32>, ) -> PyResult<Self> { - let provider = storage_options_provider.and_then(|py_obj| { - crate::storage_options::PyStorageOptionsProvider::new(py_obj) -
.ok() - .map(|py_provider| { - Arc::new( - crate::storage_options::PyStorageOptionsProviderWrapper::new(py_provider), - ) as Arc<dyn StorageOptionsProvider> - }) - }); + let accessor = crate::storage_options::create_accessor_from_python( + storage_options.clone(), + storage_options_provider, + )?; - let object_store_params = if storage_options.is_some() || provider.is_some() { + let object_store_params = if accessor.is_some() { Some(ObjectStoreParams { - storage_options: storage_options.clone(), - storage_options_provider: provider, + storage_options_accessor: accessor, ..Default::default() }) } else { @@ -2196,6 +2224,7 @@ } #[allow(clippy::too_many_arguments)] + #[allow(deprecated)] #[staticmethod] #[pyo3(signature = (dest, transactions, commit_lock = None, storage_options = None, storage_options_provider = None, enable_v2_manifest_paths = None, detached = None, max_retries = None))] fn commit_batch( dest, transactions: Vec<PyLance<Transaction>>, commit_lock: Option<&Bound<'_, PyAny>>, storage_options: Option<HashMap<String, String>>, - storage_options_provider: Option<PyObject>, + storage_options_provider: Option<&Bound<'_, PyAny>>, enable_v2_manifest_paths: Option<bool>, detached: Option<bool>, max_retries: Option<u32>, ) -> PyResult<(Self, PyLance<Transaction>)> { - let provider = storage_options_provider.and_then(|py_obj| { - crate::storage_options::PyStorageOptionsProvider::new(py_obj) - .ok() - .map(|py_provider| { - Arc::new( - crate::storage_options::PyStorageOptionsProviderWrapper::new(py_provider), - ) as Arc<dyn StorageOptionsProvider> - }) - }); + let accessor = crate::storage_options::create_accessor_from_python( + storage_options.clone(), + storage_options_provider, + )?; - let object_store_params = if storage_options.is_some() || provider.is_some() { + let object_store_params = if accessor.is_some() { Some(ObjectStoreParams { - storage_options: storage_options.clone(), - storage_options_provider: provider, + storage_options_accessor: accessor, ..Default::default() }) } else { @@ -3006,6 +3029,7 @@ fn get_dict_opt<'a, 'py, D: FromPyObject<'a>>( .transpose() } +#[allow(deprecated)] pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult<Option<WriteParams>> { let params = if options.is_none() { None @@ -3033,34 +3057,17 @@ let storage_options = get_dict_opt::<HashMap<String, String>>(options, "storage_options")?; let storage_options_provider = - get_dict_opt::<PyObject>(options, "storage_options_provider")?.and_then(|py_obj| { - crate::storage_options::PyStorageOptionsProvider::new(py_obj) - .ok() - .map(|py_provider| { - Arc::new( - crate::storage_options::PyStorageOptionsProviderWrapper::new( - py_provider, - ), - ) - as Arc<dyn StorageOptionsProvider> - }) - }); - - let s3_credentials_refresh_offset_seconds = - get_dict_opt::<u64>(options, "s3_credentials_refresh_offset_seconds")?; - - if storage_options.is_some() - || storage_options_provider.is_some() - || s3_credentials_refresh_offset_seconds.is_some() - { - let s3_credentials_refresh_offset = s3_credentials_refresh_offset_seconds - .map(std::time::Duration::from_secs) - .unwrap_or(std::time::Duration::from_secs(60)); + get_dict_opt::<Py<PyAny>>(options, "storage_options_provider")?; - p.store_params = Some(ObjectStoreParams { + if storage_options.is_some() || storage_options_provider.is_some() { + let accessor = crate::storage_options::create_accessor_from_python( storage_options, - storage_options_provider, - s3_credentials_refresh_offset, + storage_options_provider + .as_ref() + .map(|py_obj| py_obj.bind(options.py())), + )?; + p.store_params = Some(ObjectStoreParams { + storage_options_accessor: accessor, ..Default::default() }); } diff --git a/python/src/error.rs b/python/src/error.rs index ab12bead1e2..45569331289 100644 --- a/python/src/error.rs +++ b/python/src/error.rs @@ -12,13 +12,49 @@ // See the License for the specific language governing permissions and // limitations under the License. +use lance_namespace::error::NamespaceError; use pyo3::{ exceptions::{PyIOError, PyNotImplementedError, PyRuntimeError, PyValueError}, - PyResult, + types::{PyAnyMethods, PyModule}, + BoundObject, PyErr, PyResult, Python, }; use lance::Error as LanceError; +/// Try to convert a NamespaceError to the corresponding Python exception. /// Returns the appropriate Python exception from lance_namespace.errors module.
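+/// Falls back to a PyRuntimeError whose message carries a `[NamespaceError code=N]` prefix +/// when `lance_namespace.errors` or its `from_error_code` helper is unavailable.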
+fn namespace_error_to_pyerr(py: Python<'_>, ns_err: &NamespaceError) -> PyErr { + let code = ns_err.code().as_u32(); + let message = ns_err.to_string(); + + // Try to import the lance_namespace.errors module and use from_error_code + match PyModule::import(py, "lance_namespace.errors") { + Ok(module) => { + match module.getattr("from_error_code") { + Ok(from_error_code) => { + match from_error_code.call1((code, message.clone())) { + Ok(exc) => { + // Create a PyErr from the exception object + PyErr::from_value(exc.into_bound()) + } + Err(_) => PyRuntimeError::new_err(format!( + "[NamespaceError code={}] {}", + code, message + )), + } + } + Err(_) => { + PyRuntimeError::new_err(format!("[NamespaceError code={}] {}", code, message)) + } + } + } + Err(_) => { + // lance_namespace module not available, use RuntimeError with code prefix + PyRuntimeError::new_err(format!("[NamespaceError code={}] {}", code, message)) + } + } +} + pub trait PythonErrorExt<T> { /// Convert to a python error based on the Lance error type fn infer_error(self) -> PyResult<T>; @@ -43,7 +79,19 @@ impl<T> PythonErrorExt<T> for std::result::Result<T, LanceError> { LanceError::NotFound { .. } => self.value_error(), LanceError::RefNotFound { .. } => self.value_error(), LanceError::VersionNotFound { .. } => self.value_error(), - + LanceError::Namespace { source, .. } => { + // Try to downcast to NamespaceError and convert to proper Python exception + if let Some(ns_err) = source.downcast_ref::<NamespaceError>() { + Python::with_gil(|py| Err(namespace_error_to_pyerr(py, ns_err))) + } else { + log::warn!( + "Failed to downcast NamespaceError source, falling back to runtime error. \ This may indicate a version mismatch. Source type: {:?}", + source + ); + self.runtime_error() + } + } _ => self.runtime_error(), }, } diff --git a/python/src/file.rs b/python/src/file.rs index 11971e5d5d7..213f3e2f71c 100644 --- a/python/src/file.rs +++ b/python/src/file.rs @@ -37,7 +37,8 @@ use lance_io::{ use object_store::path::Path; use pyo3::{ exceptions::{PyIOError, PyRuntimeError}, - pyclass, pyfunction, pymethods, IntoPyObjectExt, PyErr, PyObject, PyResult, Python, + pyclass, pyfunction, pymethods, Bound, IntoPyObjectExt, PyAny, PyErr, PyObject, PyResult, + Python, }; use serde::Serialize; use std::collections::HashMap; @@ -239,7 +240,6 @@ impl LanceFileWriter { version: Option<String>, storage_options: Option<HashMap<String, String>>, storage_options_provider: Option<Arc<dyn StorageOptionsProvider>>, - s3_credentials_refresh_offset_seconds: Option<u64>, keep_original_array: Option<bool>, max_page_bytes: Option<u64>, ) -> PyResult<Self> { @@ -247,7 +247,6 @@ uri_or_path, storage_options, storage_options_provider, - s3_credentials_refresh_offset_seconds, ) .await?; Self::open_with_store( @@ -297,7 +296,7 @@ #[pymethods] impl LanceFileWriter { #[new] - #[pyo3(signature=(path, schema=None, data_cache_bytes=None, version=None, storage_options=None, storage_options_provider=None, s3_credentials_refresh_offset_seconds=None, keep_original_array=None, max_page_bytes=None))] + #[pyo3(signature=(path, schema=None, data_cache_bytes=None, version=None, storage_options=None, storage_options_provider=None, keep_original_array=None, max_page_bytes=None))] #[allow(clippy::too_many_arguments)] pub fn new( path: String, @@ -305,8 +304,7 @@ data_cache_bytes: Option<u64>, version: Option<String>, storage_options: Option<HashMap<String, String>>, - storage_options_provider: Option<PyObject>, - s3_credentials_refresh_offset_seconds: Option<u64>, + storage_options_provider: Option<&Bound<'_, PyAny>>, keep_original_array: Option<bool>, max_page_bytes: Option<u64>, ) -> PyResult<Self> {
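For context, the Python-level call pattern these file.rs bindings serve; a sketch drawn from this PR's file.py docstrings and tests, with a local path standing in for the S3 URIs used there:

    import os
    import tempfile
    import pyarrow as pa
    from lance.file import LanceFileReader, LanceFileWriter

    schema = pa.schema([pa.field("x", pa.int64())])
    path = os.path.join(tempfile.mkdtemp(), "example.lance")

    # With expiring credentials, also pass storage_options plus a
    # storage_options_provider (e.g. LanceNamespaceStorageOptionsProvider);
    # refresh timing now comes from the refresh_offset_millis storage option
    # rather than the removed s3_credentials_refresh_offset_seconds argument.
    writer = LanceFileWriter(path, schema=schema)
    writer.write_batch(pa.RecordBatch.from_pydict({"x": [1, 2, 3]}, schema=schema))
    writer.close()

    reader = LanceFileReader(path)
    assert reader.read_all(batch_size=1024).to_table().num_rows == 3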
@@ -324,7 +322,6 @@ impl LanceFileWriter { version, storage_options, provider, - s3_credentials_refresh_offset_seconds, keep_original_array, max_page_bytes, ), @@ -381,25 +378,33 @@ pub async fn object_store_from_uri_or_path( uri_or_path: impl AsRef<str>, storage_options: Option<HashMap<String, String>>, ) -> PyResult<(Arc<ObjectStore>, Path)> { - object_store_from_uri_or_path_with_provider(uri_or_path, storage_options, None, None).await + object_store_from_uri_or_path_with_provider(uri_or_path, storage_options, None).await } pub async fn object_store_from_uri_or_path_with_provider( uri_or_path: impl AsRef<str>, storage_options: Option<HashMap<String, String>>, storage_options_provider: Option<Arc<dyn StorageOptionsProvider>>, - s3_credentials_refresh_offset_seconds: Option<u64>, ) -> PyResult<(Arc<ObjectStore>, Path)> { let object_store_registry = Arc::new(lance::io::ObjectStoreRegistry::default()); - let mut object_store_params = ObjectStoreParams { - storage_options: storage_options.clone(), - storage_options_provider, + + let accessor = match (storage_options, storage_options_provider) { + (Some(opts), Some(provider)) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_initial_and_provider(opts, provider), + )), + (None, Some(provider)) => Some(Arc::new(lance::io::StorageOptionsAccessor::with_provider( + provider, + ))), + (Some(opts), None) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(opts), + )), + (None, None) => None, + }; + + let object_store_params = ObjectStoreParams { + storage_options_accessor: accessor, ..Default::default() }; - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - object_store_params.s3_credentials_refresh_offset = - std::time::Duration::from_secs(offset_seconds); - } let (object_store, path) = ObjectStore::from_uri_and_params( object_store_registry, @@ -423,13 +428,11 @@ impl LanceFileSession { uri_or_path: String, storage_options: Option<HashMap<String, String>>, storage_options_provider: Option<Arc<dyn StorageOptionsProvider>>, - s3_credentials_refresh_offset_seconds: Option<u64>, ) -> PyResult<Self> { let (object_store, base_path) = object_store_from_uri_or_path_with_provider( uri_or_path, storage_options, storage_options_provider, - s3_credentials_refresh_offset_seconds, ) .await?; Ok(Self { @@ -442,25 +445,16 @@ #[pymethods] impl LanceFileSession { #[new] - #[pyo3(signature=(uri_or_path, storage_options=None, storage_options_provider=None, s3_credentials_refresh_offset_seconds=None))] + #[pyo3(signature=(uri_or_path, storage_options=None, storage_options_provider=None))] pub fn new( uri_or_path: String, storage_options: Option<HashMap<String, String>>, - storage_options_provider: Option<PyObject>, - s3_credentials_refresh_offset_seconds: Option<u64>, + storage_options_provider: Option<&Bound<'_, PyAny>>, ) -> PyResult<Self> { let provider = storage_options_provider .map(crate::storage_options::py_object_to_storage_options_provider) .transpose()?; - rt().block_on( - None, - Self::try_new( - uri_or_path, - storage_options, - provider, - s3_credentials_refresh_offset_seconds, - ), - )? + rt().block_on(None, Self::try_new(uri_or_path, storage_options, provider))?
} #[pyo3(signature=(path, columns=None))] @@ -642,14 +636,12 @@ impl LanceFileReader { uri_or_path: String, storage_options: Option<HashMap<String, String>>, storage_options_provider: Option<Arc<dyn StorageOptionsProvider>>, - s3_credentials_refresh_offset_seconds: Option<u64>, columns: Option<Vec<String>>, ) -> PyResult<Self> { let (object_store, path) = object_store_from_uri_or_path_with_provider( uri_or_path, storage_options, storage_options_provider, - s3_credentials_refresh_offset_seconds, ) .await?; Self::open_with_store(object_store, path, columns).await @@ -747,27 +739,17 @@ #[pymethods] impl LanceFileReader { #[new] - #[pyo3(signature=(path, storage_options=None, storage_options_provider=None, s3_credentials_refresh_offset_seconds=None, columns=None))] + #[pyo3(signature=(path, storage_options=None, storage_options_provider=None, columns=None))] pub fn new( path: String, storage_options: Option<HashMap<String, String>>, - storage_options_provider: Option<PyObject>, - s3_credentials_refresh_offset_seconds: Option<u64>, + storage_options_provider: Option<&Bound<'_, PyAny>>, columns: Option<Vec<String>>, ) -> PyResult<Self> { let provider = storage_options_provider .map(crate::storage_options::py_object_to_storage_options_provider) .transpose()?; - rt().block_on( - None, - Self::open( - path, - storage_options, - provider, - s3_credentials_refresh_offset_seconds, - columns, - ), - )? + rt().block_on(None, Self::open(path, storage_options, provider, columns))? } pub fn read_all( diff --git a/python/src/lib.rs b/python/src/lib.rs index faf62eb546c..1512a8deef8 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -273,6 +273,7 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; #[cfg(feature = "rest-adapter")] m.add_class::()?; + m.add_class::<crate::storage_options::PyStorageOptionsAccessor>()?; m.add_wrapped(wrap_pyfunction!(bfloat16_array))?; m.add_wrapped(wrap_pyfunction!(write_dataset))?; m.add_wrapped(wrap_pyfunction!(write_fragments))?; diff --git a/python/src/namespace.rs b/python/src/namespace.rs index 4ddf0fc76a4..53d180f9cc6 100644 --- a/python/src/namespace.rs +++ b/python/src/namespace.rs @@ -7,11 +7,11 @@ use std::collections::HashMap; use std::sync::Arc; use bytes::Bytes; -use lance_namespace_impls::DirectoryNamespaceBuilder; #[cfg(feature = "rest")] use lance_namespace_impls::RestNamespaceBuilder; #[cfg(feature = "rest-adapter")] use lance_namespace_impls::{ConnectBuilder, RestAdapter, RestAdapterConfig, RestAdapterHandle}; +use lance_namespace_impls::{DirectoryNamespaceBuilder, DynamicContextProvider, OperationInfo}; use pyo3::prelude::*; use pyo3::types::{PyBytes, PyDict}; use pythonize::{depythonize, pythonize}; @@ -19,6 +19,73 @@ use crate::error::PythonErrorExt; use crate::session::Session; +/// Python-implemented dynamic context provider. +/// +/// Wraps a Python object that has a `provide_context(info: dict) -> dict` method. +/// For RestNamespace, context keys that start with `headers.` are converted to +/// HTTP headers by stripping the prefix. +pub struct PyDynamicContextProvider { + provider: Py<PyAny>, +} + +impl Clone for PyDynamicContextProvider { + fn clone(&self) -> Self { + Python::with_gil(|py| Self { + provider: self.provider.clone_ref(py), + }) + } +} + +impl PyDynamicContextProvider { + /// Create a new Python context provider wrapper.
+ pub fn new(provider: Py<PyAny>) -> Self { + Self { provider } + } +} + +impl std::fmt::Debug for PyDynamicContextProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "PyDynamicContextProvider") + } +} + +impl DynamicContextProvider for PyDynamicContextProvider { + fn provide_context(&self, info: &OperationInfo) -> HashMap<String, String> { + Python::with_gil(|py| { + // Create Python dict for operation info + let py_info = PyDict::new(py); + if py_info.set_item("operation", &info.operation).is_err() { + return HashMap::new(); + } + if py_info.set_item("object_id", &info.object_id).is_err() { + return HashMap::new(); + } + + // Call the provider's provide_context method + let result = self .provider .call_method1(py, "provide_context", (py_info,)); + + match result { + Ok(headers_py) => { + // Convert Python dict to Rust HashMap + let bound_headers = headers_py.bind(py); + if let Ok(dict) = bound_headers.downcast::<PyDict>() { + dict_to_hashmap(dict).unwrap_or_default() + } else { + log::warn!("Context provider did not return a dict"); + HashMap::new() + } + } + Err(e) => { + log::error!("Failed to call context provider: {}", e); + HashMap::new() + } + } + }) + } +} + /// Convert Python dict to HashMap fn dict_to_hashmap(dict: &Bound<'_, PyDict>) -> PyResult<HashMap<String, String>> { let mut map = HashMap::new(); @@ -39,10 +106,18 @@ pub struct PyDirectoryNamespace { #[pymethods] impl PyDirectoryNamespace { /// Create a new DirectoryNamespace from properties + /// + /// # Arguments + /// + /// * `session` - Optional Lance session for sharing storage connections + /// * `context_provider` - Optional object with `provide_context(info: dict) -> dict` method + /// for providing dynamic per-request context + /// * `**properties` - Namespace configuration properties #[new] - #[pyo3(signature = (session = None, **properties))] + #[pyo3(signature = (session = None, context_provider = None, **properties))] fn new( session: Option<&Bound<'_, Session>>, + context_provider: Option<&Bound<'_, PyAny>>, properties: Option<&Bound<'_, PyDict>>, ) -> PyResult<Self> { let mut props = HashMap::new(); @@ -53,7 +128,7 @@ let session_arc = session.map(|s| s.borrow().inner.clone()); - let builder = + let mut builder = DirectoryNamespaceBuilder::from_properties(props, session_arc).map_err(|e| { pyo3::exceptions::PyValueError::new_err(format!( "Failed to create DirectoryNamespace: {}", e )) })?; + // Add context provider if provided + if let Some(provider) = context_provider { + let py_provider = PyDynamicContextProvider::new(provider.clone().unbind()); + builder = builder.context_provider(Arc::new(py_provider)); + } + let namespace = crate::rt().block_on(None, builder.build())?.infer_error()?; Ok(Self { @@ -183,6 +264,7 @@ Ok(pythonize(py, &response)?.into()) } + #[allow(deprecated)] fn create_empty_table(&self, py: Python, request: &Bound<'_, PyAny>) -> PyResult<PyObject> { let request = depythonize(request)?; let response = crate::rt() .block_on(Some(py), self.inner.create_empty_table(request))? .infer_error()?; Ok(pythonize(py, &response)?.into()) } + + fn declare_table(&self, py: Python, request: &Bound<'_, PyAny>) -> PyResult<PyObject> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.declare_table(request))?
+ .infer_error()?; + Ok(pythonize(py, &response)?.into()) + } } #[cfg(feature = "rest")] @@ -203,22 +293,39 @@ pub struct PyRestNamespace { #[pymethods] impl PyRestNamespace { /// Create a new RestNamespace from properties + /// + /// # Arguments + /// + /// * `context_provider` - Optional object with `provide_context(info: dict) -> dict` method + /// for providing dynamic per-request context. Context keys that start with `headers.` + /// are converted to HTTP headers by stripping the prefix. For example, + /// `{"headers.Authorization": "Bearer token"}` becomes the `Authorization` header. + /// * `**properties` - Namespace configuration properties (uri, delimiter, header.*, etc.) #[new] - #[pyo3(signature = (**properties))] - fn new(properties: Option<&Bound<'_, PyDict>>) -> PyResult<Self> { + #[pyo3(signature = (context_provider = None, **properties))] + fn new( + context_provider: Option<&Bound<'_, PyAny>>, + properties: Option<&Bound<'_, PyDict>>, + ) -> PyResult<Self> { let mut props = HashMap::new(); if let Some(dict) = properties { props = dict_to_hashmap(dict)?; } - let builder = RestNamespaceBuilder::from_properties(props).map_err(|e| { + let mut builder = RestNamespaceBuilder::from_properties(props).map_err(|e| { pyo3::exceptions::PyValueError::new_err(format!( "Failed to create RestNamespace: {}", e )) })?; + // Add context provider if provided + if let Some(provider) = context_provider { + let py_provider = PyDynamicContextProvider::new(provider.clone().unbind()); + builder = builder.context_provider(Arc::new(py_provider)); + } + let namespace = builder.build(); Ok(Self { @@ -341,6 +448,7 @@ Ok(pythonize(py, &response)?.into()) } + #[allow(deprecated)] fn create_empty_table(&self, py: Python, request: &Bound<'_, PyAny>) -> PyResult<PyObject> { let request = depythonize(request)?; let response = crate::rt() .block_on(Some(py), self.inner.create_empty_table(request))? .infer_error()?; Ok(pythonize(py, &response)?.into()) } + + fn declare_table(&self, py: Python, request: &Bound<'_, PyAny>) -> PyResult<PyObject> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.declare_table(request))? + .infer_error()?; + Ok(pythonize(py, &response)?.into()) + } } #[cfg(feature = "rest-adapter")] diff --git a/python/src/storage_options.rs b/python/src/storage_options.rs index 3defd74f267..ba7ec4f4ec4 100644 --- a/python/src/storage_options.rs +++ b/python/src/storage_options.rs @@ -5,7 +5,7 @@ use std::collections::HashMap; use std::sync::Arc; use async_trait::async_trait; -use lance_io::object_store::StorageOptionsProvider; +use lance_io::object_store::{StorageOptionsAccessor, StorageOptionsProvider}; use pyo3::prelude::*; use pyo3::types::PyDict; @@ -162,8 +162,132 @@ impl StorageOptionsProvider for PyStorageOptionsProviderWrapper { /// Convert a Python object to an Arc<dyn StorageOptionsProvider> /// This is the main entry point for converting Python storage options providers to Rust pub fn py_object_to_storage_options_provider( - py_obj: PyObject, + py_obj: &Bound<'_, PyAny>, ) -> PyResult<Arc<dyn StorageOptionsProvider>> { - let py_provider = PyStorageOptionsProvider::new(py_obj)?; + let py_provider = PyStorageOptionsProvider::new(py_obj.clone().unbind())?; Ok(Arc::new(PyStorageOptionsProviderWrapper::new(py_provider))) } + +/// Python wrapper for StorageOptionsAccessor +/// +/// This wraps a Rust StorageOptionsAccessor and exposes it to Python.
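+/// +/// A hedged Python-side sketch (the option key below is illustrative): +/// `acc = StorageOptionsAccessor.with_static_options({"region": "us-west-2"})` +/// yields `acc.has_provider() == False`, and `acc.get_storage_options()` +/// returns the same dict unchanged.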
+#[pyclass(name = "StorageOptionsAccessor")] +#[derive(Clone)] +pub struct PyStorageOptionsAccessor { + inner: Arc<StorageOptionsAccessor>, +} + +impl PyStorageOptionsAccessor { + pub fn new(accessor: Arc<StorageOptionsAccessor>) -> Self { + Self { inner: accessor } + } + + pub fn inner(&self) -> Arc<StorageOptionsAccessor> { + self.inner.clone() + } +} + +#[pymethods] +impl PyStorageOptionsAccessor { + /// Create an accessor with only static options (no refresh capability) + #[staticmethod] + fn with_static_options(options: HashMap<String, String>) -> Self { + Self { + inner: Arc::new(StorageOptionsAccessor::with_static_options(options)), + } + } + + /// Create an accessor with a dynamic provider (no initial options) + /// + /// The refresh offset is extracted from storage options using the `refresh_offset_millis` key. + #[staticmethod] + fn with_provider(provider: &Bound<'_, PyAny>) -> PyResult<Self> { + let rust_provider = py_object_to_storage_options_provider(provider)?; + Ok(Self { + inner: Arc::new(StorageOptionsAccessor::with_provider(rust_provider)), + }) + } + + /// Create an accessor with initial options and a dynamic provider + /// + /// The refresh offset is extracted from initial_options using the `refresh_offset_millis` key. + #[staticmethod] + fn with_initial_and_provider( + initial_options: HashMap<String, String>, + provider: &Bound<'_, PyAny>, + ) -> PyResult<Self> { + let rust_provider = py_object_to_storage_options_provider(provider)?; + Ok(Self { + inner: Arc::new(StorageOptionsAccessor::with_initial_and_provider( + initial_options, + rust_provider, + )), + }) + } + + /// Get current valid storage options + fn get_storage_options(&self, py: Python<'_>) -> PyResult<HashMap<String, String>> { + let accessor = self.inner.clone(); + let options = rt() + .block_on(Some(py), accessor.get_storage_options())? + .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e.to_string()))?; + Ok(options.0) + } + + /// Get the initial storage options without refresh + fn initial_storage_options(&self) -> Option<HashMap<String, String>> { + self.inner.initial_storage_options().cloned() + } + + /// Get the accessor ID for equality/hashing + fn accessor_id(&self) -> String { + self.inner.accessor_id() + } + + /// Check if this accessor has a dynamic provider + fn has_provider(&self) -> bool { + self.inner.has_provider() + } + + /// Get the refresh offset in seconds + fn refresh_offset_secs(&self) -> u64 { + self.inner.refresh_offset().as_secs() + } + + fn __repr__(&self) -> String { + format!( + "StorageOptionsAccessor(id={}, has_provider={})", + self.inner.accessor_id(), + self.inner.has_provider() + ) + } +} + +/// Create a StorageOptionsAccessor from Python parameters +/// +/// This handles the conversion from Python types to Rust StorageOptionsAccessor. +/// The refresh offset is extracted from storage_options using the `refresh_offset_millis` key.
+#[allow(dead_code)] +pub fn create_accessor_from_python( + storage_options: Option<HashMap<String, String>>, + storage_options_provider: Option<&Bound<'_, PyAny>>, +) -> PyResult<Option<Arc<StorageOptionsAccessor>>> { + match (storage_options, storage_options_provider) { + (Some(opts), Some(provider)) => { + let rust_provider = py_object_to_storage_options_provider(provider)?; + Ok(Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(opts, rust_provider), + ))) + } + (None, Some(provider)) => { + let rust_provider = py_object_to_storage_options_provider(provider)?; + Ok(Some(Arc::new(StorageOptionsAccessor::with_provider( + rust_provider, + )))) + } + (Some(opts), None) => Ok(Some(Arc::new(StorageOptionsAccessor::with_static_options( + opts, + )))), + (None, None) => Ok(None), + } +} diff --git a/rust/lance-core/src/error.rs b/rust/lance-core/src/error.rs index 48150db4354..f80dbca4a7b 100644 --- a/rust/lance-core/src/error.rs +++ b/rust/lance-core/src/error.rs @@ -184,7 +184,7 @@ impl<T> LanceOptionExt<T> for Option<T> { } } -trait ToSnafuLocation { +pub trait ToSnafuLocation { fn to_snafu_location(&'static self) -> snafu::Location; } diff --git a/rust/lance-datafusion/src/exec.rs b/rust/lance-datafusion/src/exec.rs index 1bac7466700..8cf238f90bc 100644 --- a/rust/lance-datafusion/src/exec.rs +++ b/rust/lance-datafusion/src/exec.rs @@ -6,7 +6,7 @@ use std::{ collections::HashMap, fmt::{self, Formatter}, - sync::{Arc, LazyLock, Mutex}, + sync::{Arc, Mutex, OnceLock}, time::Duration, }; @@ -359,26 +359,78 @@ pub fn new_session_context(options: &LanceExecutionOptions) -> SessionContext { ctx } -static DEFAULT_SESSION_CONTEXT: LazyLock<SessionContext> = -    LazyLock::new(|| new_session_context(&LanceExecutionOptions::default())); +/// Cache key for session contexts based on resolved configuration values. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +struct SessionContextCacheKey { + mem_pool_size: u64, + target_partition: Option<usize>, + use_spilling: bool, +} + +impl SessionContextCacheKey { + fn from_options(options: &LanceExecutionOptions) -> Self { + Self { + mem_pool_size: options.mem_pool_size(), + target_partition: options.target_partition, + use_spilling: options.use_spilling(), + } + } +} -static DEFAULT_SESSION_CONTEXT_WITH_SPILLING: LazyLock<SessionContext> = LazyLock::new(|| { - new_session_context(&LanceExecutionOptions { - use_spilling: true, - ..Default::default() +struct CachedSessionContext { + context: SessionContext, + last_access: std::time::Instant, +} + +fn get_session_cache() -> &'static Mutex<HashMap<SessionContextCacheKey, CachedSessionContext>> { + static SESSION_CACHE: OnceLock<Mutex<HashMap<SessionContextCacheKey, CachedSessionContext>>> = OnceLock::new(); + SESSION_CACHE.get_or_init(|| Mutex::new(HashMap::new())) +} + +fn get_max_cache_size() -> usize { + const DEFAULT_CACHE_SIZE: usize = 4; + static MAX_CACHE_SIZE: OnceLock<usize> = OnceLock::new(); + *MAX_CACHE_SIZE.get_or_init(|| { + std::env::var("LANCE_SESSION_CACHE_SIZE") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(DEFAULT_CACHE_SIZE) }) -}); +} pub fn get_session_context(options: &LanceExecutionOptions) -> SessionContext { - if options.mem_pool_size() == DEFAULT_LANCE_MEM_POOL_SIZE && options.target_partition.is_none() - { - return if options.use_spilling() { - DEFAULT_SESSION_CONTEXT_WITH_SPILLING.clone() - } else { - DEFAULT_SESSION_CONTEXT.clone() - }; + let key = SessionContextCacheKey::from_options(options); + let mut cache = get_session_cache() + .lock() + .unwrap_or_else(|e| e.into_inner()); + + // If key exists, update access time and return + if let Some(entry) = cache.get_mut(&key) { + entry.last_access = std::time::Instant::now(); + return entry.context.clone(); + } + + // Evict least recently
used entry if cache is full + if cache.len() >= get_max_cache_size() { + if let Some(lru_key) = cache + .iter() + .min_by_key(|(_, v)| v.last_access) + .map(|(k, _)| k.clone()) + { + cache.remove(&lru_key); + } } - new_session_context(options) + + let context = new_session_context(options); + cache.insert( + key, + CachedSessionContext { + context: context.clone(), + last_access: std::time::Instant::now(), + }, + ); + context } fn get_task_context( @@ -791,3 +843,111 @@ impl ExecutionPlan for StrictBatchSizeExec { true } } + +#[cfg(test)] +mod tests { + use super::*; + + // Serialize cache tests since they share global state + static CACHE_TEST_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + + #[test] + fn test_session_context_cache() { + let _lock = CACHE_TEST_LOCK.lock().unwrap(); + let cache = get_session_cache(); + + // Clear any existing entries from other tests + cache.lock().unwrap().clear(); + + // Create first session with default options + let opts1 = LanceExecutionOptions::default(); + let _ctx1 = get_session_context(&opts1); + + { + let cache_guard = cache.lock().unwrap(); + assert_eq!(cache_guard.len(), 1); + } + + // Same options should reuse cached session (no new entry) + let _ctx1_again = get_session_context(&opts1); + { + let cache_guard = cache.lock().unwrap(); + assert_eq!(cache_guard.len(), 1); + } + + // Different options should create new entry + let opts2 = LanceExecutionOptions { + use_spilling: true, + ..Default::default() + }; + let _ctx2 = get_session_context(&opts2); + { + let cache_guard = cache.lock().unwrap(); + assert_eq!(cache_guard.len(), 2); + } + } + + #[test] + fn test_session_context_cache_lru_eviction() { + let _lock = CACHE_TEST_LOCK.lock().unwrap(); + let cache = get_session_cache(); + + // Clear any existing entries from other tests + cache.lock().unwrap().clear(); + + // Create 4 different configurations to fill the cache + let configs: Vec<LanceExecutionOptions> = (0..4) + .map(|i| LanceExecutionOptions { + mem_pool_size: Some((i + 1) as u64 * 1024 * 1024), + ..Default::default() + }) + .collect(); + + for config in &configs { + let _ctx = get_session_context(config); + } + + { + let cache_guard = cache.lock().unwrap(); + assert_eq!(cache_guard.len(), 4); + } + + // Access config[0] to make it more recently used than config[1] + // (config[0] was inserted first, so without this access it would be evicted) + std::thread::sleep(std::time::Duration::from_millis(1)); + let _ctx = get_session_context(&configs[0]); + + // Add a 5th configuration - should evict config[1] (now least recently used) + let opts5 = LanceExecutionOptions { + mem_pool_size: Some(5 * 1024 * 1024), + ..Default::default() + }; + let _ctx5 = get_session_context(&opts5); + + { + let cache_guard = cache.lock().unwrap(); + assert_eq!(cache_guard.len(), 4); + + // config[0] should still be present (was accessed recently) + let key0 = SessionContextCacheKey::from_options(&configs[0]); + assert!( + cache_guard.contains_key(&key0), + "config[0] should still be cached after recent access" + ); + + // config[1] should be evicted (was least recently used) + let key1 = SessionContextCacheKey::from_options(&configs[1]); + assert!( + !cache_guard.contains_key(&key1), + "config[1] should have been evicted" + ); + + // New config should be present + let key5 = SessionContextCacheKey::from_options(&opts5); + assert!( + cache_guard.contains_key(&key5), + "new config should be cached" + ); + } + } +} diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index
diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index ecfb93679cb..25c756950f3 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -148,7 +148,7 @@ impl PostingIterator { num_doc: usize, ) -> Self { let approximate_upper_bound = match list.max_score() { - Some(max_score) => max_score, // the index doesn't include the full BM25 upper bound at indexing time, so we need to multiply it here + Some(max_score) => max_score, None => idf(list.len(), num_doc) * (K1 + 1.0), }; @@ -265,7 +265,7 @@ impl PostingIterator { #[inline] fn block_max_score(&self) -> f32 { match self.list { - PostingList::Compressed(ref list) => list.block_max_score(self.block_idx) * (K1 + 1.0), + PostingList::Compressed(ref list) => list.block_max_score(self.block_idx), PostingList::Plain(_) => self.approximate_upper_bound, } } @@ -978,4 +978,23 @@ mod tests { assert!(result.is_ok()); } + + #[test] + fn test_block_max_score_matches_stored_value() { + let doc_ids = vec![0_u32]; + let block_max_scores = vec![0.7_f32]; + let posting_list = generate_posting_list(doc_ids, 0.7, Some(block_max_scores), true); + let expected = match &posting_list { + PostingList::Compressed(list) => list.block_max_score(0), + PostingList::Plain(_) => unreachable!("expected compressed posting list"), + }; + + let posting = PostingIterator::new(String::from("test"), 0, 0, posting_list, 1); + + let actual = posting.block_max_score(); + assert!( + (actual - expected).abs() < 1e-6, + "block max score should match stored value" + ); + } } diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index 4375a950d09..b941a57b4fd 100644 --- a/rust/lance-io/src/object_store.rs +++ b/rust/lance-io/src/object_store.rs @@ -64,7 +64,8 @@ pub const DEFAULT_DOWNLOAD_RETRY_COUNT: usize = 3; pub use providers::{ObjectStoreProvider, ObjectStoreRegistry}; pub use storage_options::{ - LanceNamespaceStorageOptionsProvider, StorageOptionsProvider, EXPIRES_AT_MILLIS_KEY, + LanceNamespaceStorageOptionsProvider, StorageOptionsAccessor, StorageOptionsProvider, + EXPIRES_AT_MILLIS_KEY, REFRESH_OFFSET_MILLIS_KEY, }; #[async_trait] @@ -127,6 +128,10 @@ pub struct ObjectStore { download_retry_count: usize, /// IO tracker for monitoring read/write operations io_tracker: IOTracker, + /// The datastore prefix that uniquely identifies this object store. It encodes information + /// which usually cannot be found in the URL, such as the Azure account name. The prefix plus the + /// path uniquely identifies any object inside the store. + pub store_prefix: String, } impl DeepSizeOf for ObjectStore { @@ -183,13 +188,18 @@ pub struct ObjectStoreParams { pub block_size: Option<usize>, #[deprecated(note = "Implement an ObjectStoreProvider instead")] pub object_store: Option<(Arc<DynObjectStore>, Url)>, + /// Refresh offset for AWS credentials when using the legacy AWS credentials path. + /// For StorageOptionsAccessor, use `refresh_offset_millis` storage option instead. pub s3_credentials_refresh_offset: Duration, #[cfg(feature = "aws")] pub aws_credentials: Option<AwsCredentialProvider>, pub object_store_wrapper: Option<Arc<dyn WrappingObjectStore>>, - pub storage_options: Option<HashMap<String, String>>, - /// Dynamic storage options provider for automatic credential refresh - pub storage_options_provider: Option<Arc<dyn StorageOptionsProvider>>, + /// Unified storage options accessor with caching and automatic refresh + /// + /// Provides storage options and optionally a dynamic provider for automatic + /// credential refresh. Use `StorageOptionsAccessor::with_static_options()` for static + /// options or `StorageOptionsAccessor::with_initial_and_provider()` for dynamic refresh.
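+ /// A rough usage sketch (illustrative only; all names are from this diff):
+ /// ```ignore
+ /// let accessor = StorageOptionsAccessor::with_static_options(HashMap::from([
+ ///     ("region".to_string(), "us-east-1".to_string()),
+ /// ]));
+ /// let params = ObjectStoreParams {
+ ///     storage_options_accessor: Some(Arc::new(accessor)),
+ ///     ..Default::default()
+ /// };
+ /// ```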
+ pub storage_options_accessor: Option<Arc<StorageOptionsAccessor>>, /// Use constant size upload parts for multipart uploads. Only necessary /// for Cloudflare R2, which doesn't support variable size parts. When this /// is false, max upload size is 2.5TB. When this is true, the max size is @@ -208,19 +218,34 @@ impl Default for ObjectStoreParams { #[cfg(feature = "aws")] aws_credentials: None, object_store_wrapper: None, - storage_options: None, - storage_options_provider: None, + storage_options_accessor: None, use_constant_size_upload_parts: false, list_is_lexically_ordered: None, } } } +impl ObjectStoreParams { + /// Get the StorageOptionsAccessor from the params + pub fn get_accessor(&self) -> Option<Arc<StorageOptionsAccessor>> { + self.storage_options_accessor.clone() + } + + /// Get storage options from the accessor, if any + /// + /// Returns the initial storage options from the accessor without triggering refresh. + pub fn storage_options(&self) -> Option<&HashMap<String, String>> { + self.storage_options_accessor + .as_ref() + .and_then(|a| a.initial_storage_options()) + } +} + // We implement hash for caching impl std::hash::Hash for ObjectStoreParams { #[allow(deprecated)] fn hash<H: std::hash::Hasher>(&self, state: &mut H) { - // For hashing, we use pointer values for ObjectStore, S3 credentials, wrapper, and storage options provider + // For hashing, we use pointer values for ObjectStore, S3 credentials, wrapper self.block_size.hash(state); if let Some((store, url)) = &self.object_store { Arc::as_ptr(store).hash(state); @@ -234,14 +259,8 @@ impl std::hash::Hash for ObjectStoreParams { if let Some(wrapper) = &self.object_store_wrapper { Arc::as_ptr(wrapper).hash(state); } - if let Some(storage_options) = &self.storage_options { - for (key, value) in storage_options { - key.hash(state); - value.hash(state); - } - } - if let Some(provider) = &self.storage_options_provider { - provider.provider_id().hash(state); + if let Some(accessor) = &self.storage_options_accessor { + accessor.accessor_id().hash(state); } self.use_constant_size_upload_parts.hash(state); self.list_is_lexically_ordered.hash(state); @@ -259,7 +278,7 @@ impl PartialEq for ObjectStoreParams { } // For equality, we use pointer comparison for ObjectStore, S3 credentials, wrapper - // For storage_options_provider, we use provider_id() for semantic equality + // For accessor, we use accessor_id() for semantic equality self.block_size == other.block_size && self .object_store @@ -272,15 +291,14 @@ impl PartialEq for ObjectStoreParams { && self.s3_credentials_refresh_offset == other.s3_credentials_refresh_offset && self.object_store_wrapper.as_ref().map(Arc::as_ptr) == other.object_store_wrapper.as_ref().map(Arc::as_ptr) - && self.storage_options == other.storage_options && self - .storage_options_provider + .storage_options_accessor .as_ref() - .map(|p| p.provider_id()) + .map(|a| a.accessor_id()) == other - .storage_options_provider + .storage_options_accessor .as_ref() - .map(|p| p.provider_id()) + .map(|a| a.accessor_id()) && self.use_constant_size_upload_parts == other.use_constant_size_upload_parts && self.list_is_lexically_ordered == other.list_is_lexically_ordered } @@ -410,7 +428,7 @@ impl ObjectStore { if let Some((store, path)) = params.object_store.as_ref() { let mut inner = store.clone(); let store_prefix = - registry.calculate_object_store_prefix(uri, params.storage_options.as_ref())?; + registry.calculate_object_store_prefix(uri, params.storage_options())?; if let Some(wrapper) = params.object_store_wrapper.as_ref() { inner = wrapper.wrap(&store_prefix, inner); } @@ -429,6 +447,7 @@ impl
ObjectStore { io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, download_retry_count: DEFAULT_DOWNLOAD_RETRY_COUNT, io_tracker, + store_prefix: String::new(), // custom object store, no prefix needed }; let path = Path::parse(path.path())?; return Ok((Arc::new(store), path)); } @@ -859,13 +878,12 @@ impl ObjectStore { let scheme = location.scheme(); let block_size = block_size.unwrap_or_else(|| infer_block_size(scheme)); + let store_prefix = DEFAULT_OBJECT_STORE_REGISTRY + .calculate_object_store_prefix(location.as_ref(), storage_options) + .unwrap_or_default(); + let store = match wrapper { - Some(wrapper) => { - let store_prefix = DEFAULT_OBJECT_STORE_REGISTRY - .calculate_object_store_prefix(location.as_ref(), storage_options) - .unwrap(); - wrapper.wrap(&store_prefix, store) - } + Some(wrapper) => wrapper.wrap(&store_prefix, store), None => store, }; @@ -883,6 +901,7 @@ impl ObjectStore { io_parallelism, download_retry_count, io_tracker, + store_prefix, } } } @@ -974,8 +993,11 @@ mod tests { ) { // Test the default let registry = Arc::new(ObjectStoreRegistry::default()); + let accessor = storage_options + .clone() + .map(|opts| Arc::new(StorageOptionsAccessor::with_static_options(opts))); let params = ObjectStoreParams { - storage_options: storage_options.clone(), + storage_options_accessor: accessor.clone(), ..ObjectStoreParams::default() }; let (store, _) = ObjectStore::from_uri_and_params(registry, uri, &params) .await @@ -987,7 +1009,7 @@ let registry = Arc::new(ObjectStoreRegistry::default()); let params = ObjectStoreParams { block_size: Some(1024), - storage_options: storage_options.clone(), + storage_options_accessor: accessor, ..ObjectStoreParams::default() }; let (store, _) = ObjectStore::from_uri_and_params(registry, uri, &params) diff --git a/rust/lance-io/src/object_store/providers.rs b/rust/lance-io/src/object_store/providers.rs index 17cbb3900d2..032c979c134 100644 --- a/rust/lance-io/src/object_store/providers.rs +++ b/rust/lance-io/src/object_store/providers.rs @@ -172,7 +172,7 @@ impl ObjectStoreRegistry { }; let cache_path = - provider.calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?; + provider.calculate_object_store_prefix(&base_path, params.storage_options())?; let cache_key = (cache_path.clone(), params.clone()); // Check if we have a cached store for this base path and params diff --git a/rust/lance-io/src/object_store/providers/aws.rs b/rust/lance-io/src/object_store/providers/aws.rs index 9bd93bf029a..982470581f2 100644 --- a/rust/lance-io/src/object_store/providers/aws.rs +++ b/rust/lance-io/src/object_store/providers/aws.rs @@ -28,8 +28,9 @@ use tokio::sync::RwLock; use url::Url; use crate::object_store::{ - ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptions, StorageOptionsProvider, - DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, + ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptions, StorageOptionsAccessor, + StorageOptionsProvider, DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, + DEFAULT_MAX_IOP_SIZE, }; use lance_core::error::{Error, Result}; @@ -54,13 +55,16 @@ impl AwsStoreProvider { let mut s3_storage_options = storage_options.as_s3_options(); let region = resolve_s3_region(base_path, &s3_storage_options).await?; + + // Get accessor from params + let accessor = params.get_accessor(); + let (aws_creds, region) = build_aws_credential( params.s3_credentials_refresh_offset, params.aws_credentials.clone(), Some(&s3_storage_options), region, -
params.storage_options_provider.clone(), - storage_options.expires_at_millis(), + accessor, ) .await?; @@ -132,7 +136,7 @@ impl ObjectStoreProvider for AwsStoreProvider { ) -> Result<ObjectStore> { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); let mut storage_options = - StorageOptions(params.storage_options.clone().unwrap_or_default()); + StorageOptions(params.storage_options().cloned().unwrap_or_default()); storage_options.with_env_s3(); let download_retry_count = storage_options.download_retry_count(); @@ -171,6 +175,8 @@ impl ObjectStoreProvider for AwsStoreProvider { io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, download_retry_count, io_tracker: Default::default(), + store_prefix: self + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } } @@ -226,20 +232,17 @@ async fn resolve_s3_region( /// Build AWS credentials /// /// This resolves credentials from the following sources in order: -/// 1. An explicit `storage_options_provider` +/// 1. An explicit `storage_options_accessor` with a provider /// 2. An explicit `credentials` provider /// 3. Explicit credentials in storage_options (as in `aws_access_key_id`, /// `aws_secret_access_key`, `aws_session_token`) /// 4. The default credential provider chain from AWS SDK. /// -/// # Initial Credentials with Storage Options Provider +/// # Storage Options Accessor /// -/// When `storage_options_provider` is provided along with `storage_options` and -/// `expires_at_millis`, these serve as **initial values** to avoid redundant calls to -/// fetch new storage options. The provider will use these initial credentials until they -/// expire (based on `expires_at_millis`), then automatically fetch fresh credentials from -/// the provider. Once the initial credentials expire, the passed-in values are no longer -/// used - all future credentials come from the provider's `fetch_storage_options()` method. +/// When `storage_options_accessor` is provided and has a dynamic provider, +/// credentials are fetched and cached by the accessor with automatic refresh +/// before expiration. /// /// `credentials_refresh_offset` is the amount of time before expiry to refresh credentials.
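+/// # Example (illustrative sketch; the accessor is assumed to be built elsewhere)
+///
+/// ```ignore
+/// let (creds, region) = build_aws_credential(
+///     Duration::from_secs(60),       // refresh offset for the legacy path
+///     None,                          // no explicit credential provider
+///     None,                          // no static storage options
+///     Some("us-east-1".to_string()),
+///     Some(accessor),                // Arc<StorageOptionsAccessor>
+/// )
+/// .await?;
+/// ```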
pub async fn build_aws_credential( @@ -247,10 +250,8 @@ credentials: Option<AwsCredentialProvider>, storage_options: Option<&HashMap<String, String>>, region: Option<String>, - storage_options_provider: Option<Arc<dyn StorageOptionsProvider>>, - expires_at_millis: Option<u64>, + storage_options_accessor: Option<Arc<StorageOptionsAccessor>>, ) -> Result<(AwsCredentialProvider, String)> { - // TODO: make this return no credential provider not using AWS use aws_config::meta::region::RegionProviderChain; const DEFAULT_REGION: &str = "us-west-2"; @@ -266,17 +267,24 @@ pub async fn build_aws_credential( }; let storage_options_credentials = storage_options.and_then(extract_static_s3_credentials); - if let Some(storage_options_provider) = storage_options_provider { - let creds = build_aws_credential_with_storage_options_provider( - storage_options_provider, - credentials_refresh_offset, - credentials, - storage_options_credentials, - expires_at_millis, - ) - .await?; - Ok((creds, region)) - } else if let Some(creds) = credentials { + + // If accessor has a provider, use DynamicStorageOptionsCredentialProvider + if let Some(accessor) = storage_options_accessor { + if accessor.has_provider() { + // Explicit aws_credentials takes precedence + if let Some(creds) = credentials { + return Ok((creds, region)); + } + // Use accessor for dynamic credential refresh + return Ok(( + Arc::new(DynamicStorageOptionsCredentialProvider::new(accessor)), + region, + )); + } + } + + // Fall back to existing logic for static credentials + if let Some(creds) = credentials { Ok((creds, region)) } else if let Some(creds) = storage_options_credentials { Ok((Arc::new(creds), region)) @@ -293,58 +301,6 @@ } } -async fn build_aws_credential_with_storage_options_provider( - storage_options_provider: Arc<dyn StorageOptionsProvider>, - credentials_refresh_offset: Duration, - credentials: Option<AwsCredentialProvider>, - storage_options_credentials: Option<StaticCredentialProvider<ObjectStoreAwsCredential>>, - expires_at_millis: Option<u64>, -) -> Result<AwsCredentialProvider> { - match (expires_at_millis, credentials, storage_options_credentials) { - // Case 1: provider + credentials + expiration time - (Some(expires_at), Some(cred), _) => { - Ok(Arc::new( - DynamicStorageOptionsCredentialProvider::new_with_initial_credential( - storage_options_provider, - credentials_refresh_offset, - cred.get_credential().await?, - expires_at, - ), - )) - } - // Case 2: provider + storage_options (with valid credentials) + expiration time - (Some(expires_at), None, Some(cred)) => { - Ok(Arc::new( - DynamicStorageOptionsCredentialProvider::new_with_initial_credential( - storage_options_provider, - credentials_refresh_offset, - cred.get_credential().await?, - expires_at, - ), - )) - } - // Case 3: provider + storage_options without expiration - FAIL - (None, None, Some(_)) => Err(Error::IO { - source: Box::new(std::io::Error::other( - "expires_at_millis is required when using storage_options_provider with storage_options", - )), - location: location!(), - }), - // Case 4: provider + credentials without expiration - FAIL - (None, Some(_), _) => Err(Error::IO { - source: Box::new(std::io::Error::other( - "expires_at_millis is required when using storage_options_provider with credentials", - )), - location: location!(), - }), - // Case 5: provider without credentials/storage_options, or with expiration but no creds/opts - (_, None, None) => Ok(Arc::new(DynamicStorageOptionsCredentialProvider::new( - storage_options_provider, - credentials_refresh_offset, - ))), - } -} - fn extract_static_s3_credentials( options: &HashMap<String, String>, ) -> Option<StaticCredentialProvider<ObjectStoreAwsCredential>> { @@ -487,20 +443,24 @@ impl ObjectStoreParams { aws_credentials: Option<AwsCredentialProvider>, region:
Option<String>, ) -> Self { + let storage_options_accessor = region.map(|region| { + let opts: HashMap<String, String> = + [("region".into(), region)].iter().cloned().collect(); + Arc::new(StorageOptionsAccessor::with_static_options(opts)) + }); Self { aws_credentials, - storage_options: region - .map(|region| [("region".into(), region)].iter().cloned().collect()), + storage_options_accessor, ..Default::default() } } } -/// AWS Credential Provider that uses StorageOptionsProvider +/// AWS Credential Provider that delegates to StorageOptionsAccessor /// -/// This adapter converts our generic StorageOptionsProvider trait into -/// AWS-specific credentials that can be used with S3. It caches credentials -/// and automatically refreshes them before they expire. +/// This adapter converts storage options from a [`StorageOptionsAccessor`] into +/// AWS-specific credentials that can be used with S3. All caching and refresh logic +/// is handled by the accessor. /// /// # Future Work /// @@ -510,128 +470,71 @@ impl ObjectStoreParams { /// /// See: pub struct DynamicStorageOptionsCredentialProvider { - provider: Arc<dyn StorageOptionsProvider>, - cache: Arc<RwLock<Option<CachedCredential>>>, - refresh_offset: Duration, + accessor: Arc<StorageOptionsAccessor>, } impl fmt::Debug for DynamicStorageOptionsCredentialProvider { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("DynamicStorageOptionsCredentialProvider") - .field("provider", &self.provider) - .field("refresh_offset", &self.refresh_offset) + .field("accessor", &self.accessor) .finish() } } -#[derive(Debug, Clone)] -struct CachedCredential { - credential: Arc<ObjectStoreAwsCredential>, - expires_at_millis: Option<u64>, -} - impl DynamicStorageOptionsCredentialProvider { - /// Create a new credential provider without initial credentials + /// Create a new credential provider from a storage options accessor + pub fn new(accessor: Arc<StorageOptionsAccessor>) -> Self { + Self { accessor } + } + + /// Create a new credential provider from a storage options provider + /// + /// This is a convenience constructor for backward compatibility. + /// The refresh offset will be extracted from storage options using + /// the `refresh_offset_millis` key, defaulting to 60 seconds. /// /// # Arguments /// * `provider` - The storage options provider - /// * `refresh_offset` - Duration before expiry to refresh credentials - pub fn new(provider: Arc<dyn StorageOptionsProvider>, refresh_offset: Duration) -> Self { + pub fn from_provider(provider: Arc<dyn StorageOptionsProvider>) -> Self { Self { - provider, - cache: Arc::new(RwLock::new(None)), - refresh_offset, + accessor: Arc::new(StorageOptionsAccessor::with_provider(provider)), } } - /// Create a new credential provider with initial credentials from an explicit credential + /// Create a new credential provider with initial credentials + /// + /// This is a convenience constructor for backward compatibility. + /// The refresh offset will be extracted from initial_options using + /// the `refresh_offset_millis` key, defaulting to 60 seconds.
/// /// # Arguments /// * `provider` - The storage options provider - /// * `refresh_offset` - Duration before expiry to refresh credentials - /// * `credential` - Initial credential to cache - /// * `expires_at_millis` - Expiration time in milliseconds since epoch (required for refresh) - pub fn new_with_initial_credential( + /// * `initial_options` - Initial storage options to cache + pub fn from_provider_with_initial( provider: Arc<dyn StorageOptionsProvider>, - refresh_offset: Duration, - credential: Arc<ObjectStoreAwsCredential>, - expires_at_millis: u64, + initial_options: HashMap<String, String>, ) -> Self { Self { - provider, - cache: Arc::new(RwLock::new(Some(CachedCredential { - credential, - expires_at_millis: Some(expires_at_millis), - }))), - refresh_offset, - } - } - - fn needs_refresh(&self, cached: &Option<CachedCredential>) -> bool { - match cached { - None => true, - Some(cached_cred) => { - if let Some(expires_at_millis) = cached_cred.expires_at_millis { - let now_ms = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap_or(Duration::from_secs(0)) - .as_millis() as u64; - - // Refresh if we're within the refresh offset of expiration - let refresh_offset_millis = self.refresh_offset.as_millis() as u64; - now_ms + refresh_offset_millis >= expires_at_millis - } else { - // No expiration means credentials never expire - false - } - } + accessor: Arc::new(StorageOptionsAccessor::with_initial_and_provider( + initial_options, + provider, + )), } } +} - async fn do_get_credential(&self) -> ObjectStoreResult<Option<Arc<ObjectStoreAwsCredential>>> { - // Check if we have valid cached credentials with read lock - { - let cached = self.cache.read().await; - if !self.needs_refresh(&cached) { - if let Some(cached_cred) = &*cached { - return Ok(Some(cached_cred.credential.clone())); - } - } - } - - // Try to acquire write lock - if it fails, return None and let caller retry - let Ok(mut cache) = self.cache.try_write() else { - return Ok(None); - }; - - // Double-check if credentials are still stale after acquiring write lock - // (another thread might have refreshed them) - if !self.needs_refresh(&cache) { - if let Some(cached_cred) = &*cache { - return Ok(Some(cached_cred.credential.clone())); - } - } - - log::debug!( - "Refreshing S3 credentials from storage options provider: {}", - self.provider.provider_id() - ); +#[async_trait::async_trait] +impl CredentialProvider for DynamicStorageOptionsCredentialProvider { + type Credential = ObjectStoreAwsCredential; - let storage_options_map = self - .provider - .fetch_storage_options() - .await - .map_err(|e| object_store::Error::Generic { + async fn get_credential(&self) -> ObjectStoreResult<Arc<Self::Credential>> { + let storage_options = self.accessor.get_storage_options().await.map_err(|e| { + object_store::Error::Generic { store: "DynamicStorageOptionsCredentialProvider", source: Box::new(e),
- .ok_or_else(|| object_store::Error::Generic { - store: "DynamicStorageOptionsCredentialProvider", - source: "No storage options available".into(), - })?; + } + })?; - let storage_options = StorageOptions(storage_options_map); - let expires_at_millis = storage_options.expires_at_millis(); let s3_options = storage_options.as_s3_options(); let static_creds = extract_static_s3_credentials(&s3_options).ok_or_else(|| { object_store::Error::Generic { @@ -640,58 +543,13 @@ impl DynamicStorageOptionsCredentialProvider { } })?; - let credential = - static_creds - .get_credential() - .await - .map_err(|e| object_store::Error::Generic { - store: "DynamicStorageOptionsCredentialProvider", - source: Box::new(e), - })?; - - if let Some(expires_at) = expires_at_millis { - let now_ms = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap_or(Duration::from_secs(0)) - .as_millis() as u64; - let expires_in_secs = (expires_at.saturating_sub(now_ms)) / 1000; - log::debug!( - "Successfully refreshed S3 credentials from provider: {}, credentials expire in {} seconds", - self.provider.provider_id(), - expires_in_secs - ); - } else { - log::debug!( - "Successfully refreshed S3 credentials from provider: {} (no expiration)", - self.provider.provider_id() - ); - } - - *cache = Some(CachedCredential { - credential: credential.clone(), - expires_at_millis, - }); - - Ok(Some(credential)) - } -} - -#[async_trait::async_trait] -impl CredentialProvider for DynamicStorageOptionsCredentialProvider { - type Credential = ObjectStoreAwsCredential; - - async fn get_credential(&self) -> ObjectStoreResult<Arc<Self::Credential>> { - // Retry loop - if do_get_credential returns None (lock busy), retry from the beginning - loop { - match self.do_get_credential().await? { - Some(cred) => return Ok(cred), - None => { - // Lock was busy, wait 10ms before retrying - tokio::time::sleep(Duration::from_millis(10)).await; - continue; - } - } - } + static_creds + .get_credential() + .await + .map_err(|e| object_store::Error::Generic { + store: "DynamicStorageOptionsCredentialProvider", + source: Box::new(e), + }) } } @@ -813,13 +671,16 @@ mod tests { #[tokio::test] async fn test_use_opendal_flag() { + use crate::object_store::StorageOptionsAccessor; let provider = AwsStoreProvider; let url = Url::parse("s3://test-bucket/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options: Some(HashMap::from([ - ("use_opendal".to_string(), "true".to_string()), - ("region".to_string(), "us-west-2".to_string()), - ])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( + HashMap::from([ + ("use_opendal".to_string(), "true".to_string()), + ("region".to_string(), "us-west-2".to_string()), + ]), + ))), ..Default::default() }; @@ -896,19 +757,22 @@ mod tests { 600_000, // Expires in 10 minutes ))); - // Create credential provider with initial cached credentials that expire in 10 minutes + // Create initial options with cached credentials that expire in 10 minutes let expires_at = now_ms + 600_000; // 10 minutes from now - let initial_cred = Arc::new(ObjectStoreAwsCredential { - key_id: "AKID_CACHED".to_string(), - secret_key: "SECRET_CACHED".to_string(), - token: Some("TOKEN_CACHED".to_string()), - }); + let initial_options = HashMap::from([ + ("aws_access_key_id".to_string(), "AKID_CACHED".to_string()), + ( + "aws_secret_access_key".to_string(), + "SECRET_CACHED".to_string(), + ), + ("aws_session_token".to_string(), "TOKEN_CACHED".to_string()), + ("expires_at_millis".to_string(), expires_at.to_string()), +
("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset + ]); - let provider = DynamicStorageOptionsCredentialProvider::new_with_initial_credential( + let provider = DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - initial_cred, - expires_at, + initial_options, ); // First call should use cached credentials (not expired yet) @@ -932,19 +796,21 @@ mod tests { 600_000, // Expires in 10 minutes ))); - // Create credential provider with initial cached credentials that expired 1 second ago + // Create initial options with credentials that expired 1 second ago let expired_time = now_ms - 1_000; // 1 second ago - let initial_cred = Arc::new(ObjectStoreAwsCredential { - key_id: "AKID_EXPIRED".to_string(), - secret_key: "SECRET_EXPIRED".to_string(), - token: None, - }); + let initial_options = HashMap::from([ + ("aws_access_key_id".to_string(), "AKID_EXPIRED".to_string()), + ( + "aws_secret_access_key".to_string(), + "SECRET_EXPIRED".to_string(), + ), + ("expires_at_millis".to_string(), expired_time.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset + ]); - let provider = DynamicStorageOptionsCredentialProvider::new_with_initial_credential( + let provider = DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - initial_cred, - expired_time, + initial_options, ); // First call should fetch new credentials because cached ones are expired @@ -961,27 +827,24 @@ mod tests { async fn test_dynamic_credential_provider_refresh_lead_time() { MockClock::set_system_time(Duration::from_secs(100_000)); - // Create a mock provider that returns credentials expiring in 4 minutes + // Create a mock provider that returns credentials expiring in 30 seconds let mock = Arc::new(MockStorageOptionsProvider::new(Some( - 240_000, // Expires in 4 minutes + 30_000, // Expires in 30 seconds ))); - // Create credential provider with 5 minute refresh offset - // This means credentials should be refreshed when they have less than 5 minutes left - let provider = DynamicStorageOptionsCredentialProvider::new( - mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - ); + // Create credential provider with default 60 second refresh offset + // This means credentials should be refreshed when they have less than 60 seconds left + let provider = DynamicStorageOptionsCredentialProvider::from_provider(mock.clone()); // First call should fetch credentials from provider (no initial cache) - // Credentials expire in 4 minutes, which is less than our 5 minute refresh offset, + // Credentials expire in 30 seconds, which is less than our 60 second refresh offset, // so they should be considered "needs refresh" immediately let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_1"); assert_eq!(mock.get_call_count().await, 1); - // Second call should trigger refresh because credentials expire in 4 minutes - // but our refresh lead time is 5 minutes (now + 5min > expires_at) + // Second call should trigger refresh because credentials expire in 30 seconds + // but our refresh lead time is 60 seconds (now + 60sec > expires_at) // The mock will return new credentials (AKID_2) with the same expiration let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_2"); @@ -992,16 +855,13 @@ mod tests { async fn 
test_dynamic_credential_provider_no_initial_cache() { MockClock::set_system_time(Duration::from_secs(100_000)); - // Create a mock provider that returns credentials expiring in 10 minutes + // Create a mock provider that returns credentials expiring in 2 minutes let mock = Arc::new(MockStorageOptionsProvider::new(Some( - 600_000, // Expires in 10 minutes + 120_000, // Expires in 2 minutes ))); - // Create credential provider without initial cache - let provider = DynamicStorageOptionsCredentialProvider::new( - mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - ); + // Create credential provider without initial cache, using default 60 second refresh offset + let provider = DynamicStorageOptionsCredentialProvider::from_provider(mock.clone()); // First call should fetch from provider (call count = 1) let cred = provider.get_credential().await.unwrap(); @@ -1010,21 +870,22 @@ mod tests { assert_eq!(cred.token, Some("TOKEN_1".to_string())); assert_eq!(mock.get_call_count().await, 1); - // Second call should use cached credentials (not expired yet) + // Second call should use cached credentials (not expired yet, still > 60 seconds remaining) let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_1"); assert_eq!(mock.get_call_count().await, 1); // Still 1, didn't fetch again - // Advance time to 6 minutes - should trigger refresh (within 5 min refresh offset) - MockClock::set_system_time(Duration::from_secs(100_000 + 360)); + // Advance time to 90 seconds - should trigger refresh (within 60 sec refresh offset) + // At this point, credentials expire in 30 seconds (< 60 sec offset) + MockClock::set_system_time(Duration::from_secs(100_000 + 90)); let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_2"); assert_eq!(cred.secret_key, "SECRET_2"); assert_eq!(cred.token, Some("TOKEN_2".to_string())); assert_eq!(mock.get_call_count().await, 2); - // Advance time to 11 minutes total - should trigger another refresh - MockClock::set_system_time(Duration::from_secs(100_000 + 660)); + // Advance time to 210 seconds total (90 + 120) - should trigger another refresh + MockClock::set_system_time(Duration::from_secs(100_000 + 210)); let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_3"); assert_eq!(cred.secret_key, "SECRET_3"); @@ -1032,7 +893,7 @@ mod tests { } #[tokio::test] - async fn test_dynamic_credential_provider_with_initial_credential() { + async fn test_dynamic_credential_provider_with_initial_options() { MockClock::set_system_time(Duration::from_secs(100_000)); let now_ms = MockClock::system_time().as_millis() as u64; @@ -1042,20 +903,23 @@ mod tests { 600_000, // Expires in 10 minutes ))); - // Create an initial credential with expiration in 10 minutes + // Create initial options with expiration in 10 minutes let expires_at = now_ms + 600_000; // 10 minutes from now - let initial_cred = Arc::new(ObjectStoreAwsCredential { - key_id: "AKID_INITIAL".to_string(), - secret_key: "SECRET_INITIAL".to_string(), - token: Some("TOKEN_INITIAL".to_string()), - }); + let initial_options = HashMap::from([ + ("aws_access_key_id".to_string(), "AKID_INITIAL".to_string()), + ( + "aws_secret_access_key".to_string(), + "SECRET_INITIAL".to_string(), + ), + ("aws_session_token".to_string(), "TOKEN_INITIAL".to_string()), + ("expires_at_millis".to_string(), expires_at.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset + ]); - // Create credential provider with 
initial credential and expiration - let provider = DynamicStorageOptionsCredentialProvider::new_with_initial_credential( + // Create credential provider with initial options + let provider = DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - initial_cred, - expires_at, + initial_options, ); // First call should use the initial credential (not expired yet) @@ -1104,9 +968,8 @@ mod tests { // Create a mock provider with far future expiration let mock = Arc::new(MockStorageOptionsProvider::new(Some(9999999999999))); - let provider = Arc::new(DynamicStorageOptionsCredentialProvider::new( + let provider = Arc::new(DynamicStorageOptionsCredentialProvider::from_provider( mock.clone(), - Duration::from_secs(300), )); // Spawn 10 concurrent tasks that all try to get credentials at the same time @@ -1152,14 +1015,18 @@ mod tests { let now_ms = MockClock::system_time().as_millis() as u64; - // Create initial credentials that expired in the past (1000 seconds ago) + // Create initial options with credentials that expired in the past (1000 seconds ago) let expires_at = now_ms - 1_000_000; - - let initial_cred = Arc::new(ObjectStoreAwsCredential { - key_id: "AKID_OLD".to_string(), - secret_key: "SECRET_OLD".to_string(), - token: Some("TOKEN_OLD".to_string()), - }); + let initial_options = HashMap::from([ + ("aws_access_key_id".to_string(), "AKID_OLD".to_string()), + ( + "aws_secret_access_key".to_string(), + "SECRET_OLD".to_string(), + ), + ("aws_session_token".to_string(), "TOKEN_OLD".to_string()), + ("expires_at_millis".to_string(), expires_at.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset + ]); // Mock will return credentials expiring in 1 hour let mock = Arc::new(MockStorageOptionsProvider::new(Some( @@ -1167,11 +1034,9 @@ mod tests { ))); let provider = Arc::new( - DynamicStorageOptionsCredentialProvider::new_with_initial_credential( + DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), - Duration::from_secs(300), - initial_cred, - expires_at, + initial_options, ), ); @@ -1217,4 +1082,112 @@ mod tests { call_count ); } + + #[tokio::test] + async fn test_explicit_aws_credentials_takes_precedence_over_accessor() { + // Create a mock storage options provider that should NOT be called + let mock_storage_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + // Create an accessor with the mock provider + let accessor = Arc::new(StorageOptionsAccessor::with_provider( + mock_storage_provider.clone(), + )); + + // Create an explicit AWS credentials provider + let explicit_cred_provider = Arc::new(MockAwsCredentialsProvider::default()); + + // Build credentials with both aws_credentials AND accessor + // The explicit aws_credentials should take precedence + let (result, _region) = build_aws_credential( + Duration::from_secs(300), + Some(explicit_cred_provider.clone() as AwsCredentialProvider), + None, // no storage_options + Some("us-west-2".to_string()), + Some(accessor), + ) + .await + .unwrap(); + + // Get credential from the result + let cred = result.get_credential().await.unwrap(); + + // The explicit provider should have been called (it returns empty strings) + assert!(explicit_cred_provider.called.load(Ordering::Relaxed)); + + // The storage options provider should NOT have been called + assert_eq!( + mock_storage_provider.get_call_count().await, + 0, + "Storage options provider should not be called when 
explicit aws_credentials is provided" ); + + // Verify we got credentials from the explicit provider (empty strings) + assert_eq!(cred.key_id, ""); + assert_eq!(cred.secret_key, ""); + } + + #[tokio::test] + async fn test_accessor_used_when_no_explicit_aws_credentials() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let now_ms = MockClock::system_time().as_millis() as u64; + + // Create a mock storage options provider + let mock_storage_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + // Create initial options + let expires_at = now_ms + 600_000; // 10 minutes from now + let initial_options = HashMap::from([ + ( + "aws_access_key_id".to_string(), + "AKID_FROM_ACCESSOR".to_string(), + ), + ( + "aws_secret_access_key".to_string(), + "SECRET_FROM_ACCESSOR".to_string(), + ), + ( + "aws_session_token".to_string(), + "TOKEN_FROM_ACCESSOR".to_string(), + ), + ("expires_at_millis".to_string(), expires_at.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset + ]); + + // Create an accessor with initial options and provider + let accessor = Arc::new(StorageOptionsAccessor::with_initial_and_provider( + initial_options, + mock_storage_provider.clone(), + )); + + // Build credentials with accessor but NO explicit aws_credentials + let (result, _region) = build_aws_credential( + Duration::from_secs(300), + None, // no explicit aws_credentials + None, // no storage_options + Some("us-west-2".to_string()), + Some(accessor), + ) + .await + .unwrap(); + + // Get credential - should use the initial accessor credentials + let cred = result.get_credential().await.unwrap(); + assert_eq!(cred.key_id, "AKID_FROM_ACCESSOR"); + assert_eq!(cred.secret_key, "SECRET_FROM_ACCESSOR"); + + // Storage options provider should NOT have been called yet (using cached initial creds) + assert_eq!(mock_storage_provider.get_call_count().await, 0); + + // Advance time to trigger refresh (past the 5 minute refresh offset) + MockClock::set_system_time(Duration::from_secs(100_000 + 360)); + + // Get credential again - should now fetch from provider + let cred = result.get_credential().await.unwrap(); + assert_eq!(cred.key_id, "AKID_1"); + assert_eq!(cred.secret_key, "SECRET_1"); + + // Storage options provider should have been called once + assert_eq!(mock_storage_provider.get_call_count().await, 1); + } } diff --git a/rust/lance-io/src/object_store/providers/azure.rs b/rust/lance-io/src/object_store/providers/azure.rs index 7a90fc6744a..7bf566c8972 100644 --- a/rust/lance-io/src/object_store/providers/azure.rs +++ b/rust/lance-io/src/object_store/providers/azure.rs @@ -95,7 +95,7 @@ impl ObjectStoreProvider for AzureBlobStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result<ObjectStore> { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); let mut storage_options = - StorageOptions(params.storage_options.clone().unwrap_or_default()); + StorageOptions(params.storage_options().cloned().unwrap_or_default()); storage_options.with_env_azure(); let download_retry_count = storage_options.download_retry_count(); @@ -123,6 +123,8 @@ impl ObjectStoreProvider for AzureBlobStoreProvider { io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, download_retry_count, io_tracker: Default::default(), + store_prefix: self + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } @@ -230,21 +232,24 @@ mod tests { #[tokio::test] async fn test_use_opendal_flag() { + use
crate::object_store::StorageOptionsAccessor; let provider = AzureBlobStoreProvider; let url = Url::parse("az://test-container/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options: Some(HashMap::from([ - ("use_opendal".to_string(), "true".to_string()), - ("account_name".to_string(), "test_account".to_string()), - ( - "endpoint".to_string(), - "https://test_account.blob.core.windows.net".to_string(), - ), - ( - "account_key".to_string(), - "dGVzdF9hY2NvdW50X2tleQ==".to_string(), - ), - ])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( + HashMap::from([ + ("use_opendal".to_string(), "true".to_string()), + ("account_name".to_string(), "test_account".to_string()), + ( + "endpoint".to_string(), + "https://test_account.blob.core.windows.net".to_string(), + ), + ( + "account_key".to_string(), + "dGVzdF9hY2NvdW50X2tleQ==".to_string(), + ), + ]), + ))), ..Default::default() }; diff --git a/rust/lance-io/src/object_store/providers/gcp.rs b/rust/lance-io/src/object_store/providers/gcp.rs index 038015d7f4e..dba5cd8dd40 100644 --- a/rust/lance-io/src/object_store/providers/gcp.rs +++ b/rust/lance-io/src/object_store/providers/gcp.rs @@ -96,7 +96,7 @@ impl ObjectStoreProvider for GcsStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result<ObjectStore> { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); let mut storage_options = - StorageOptions(params.storage_options.clone().unwrap_or_default()); + StorageOptions(params.storage_options().cloned().unwrap_or_default()); storage_options.with_env_gcs(); let download_retry_count = storage_options.download_retry_count(); @@ -124,6 +124,8 @@ impl ObjectStoreProvider for GcsStoreProvider { io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, download_retry_count, io_tracker: Default::default(), + store_prefix: self + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } } @@ -180,16 +182,19 @@ mod tests { #[tokio::test] async fn test_use_opendal_flag() { + use crate::object_store::StorageOptionsAccessor; let provider = GcsStoreProvider; let url = Url::parse("gs://test-bucket/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options: Some(HashMap::from([ - ("use_opendal".to_string(), "true".to_string()), - ( - "service_account".to_string(), - "test@example.iam.gserviceaccount.com".to_string(), - ), - ])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( + HashMap::from([ + ("use_opendal".to_string(), "true".to_string()), + ( + "service_account".to_string(), + "test@example.iam.gserviceaccount.com".to_string(), + ), + ]), + ))), ..Default::default() }; diff --git a/rust/lance-io/src/object_store/providers/huggingface.rs b/rust/lance-io/src/object_store/providers/huggingface.rs index c52c85a3c72..55c5f6d50b9 100644 --- a/rust/lance-io/src/object_store/providers/huggingface.rs +++ b/rust/lance-io/src/object_store/providers/huggingface.rs @@ -65,7 +65,7 @@ impl ObjectStoreProvider for HuggingfaceStoreProvider { } = parse_hf_url(&base_path)?; let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); let download_retry_count = storage_options.download_retry_count(); // Build OpenDAL config with allowed keys only.
@@ -114,6 +114,8 @@ impl ObjectStoreProvider for HuggingfaceStoreProvider { io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, download_retry_count, io_tracker: Default::default(), + store_prefix: self + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } @@ -157,12 +159,13 @@ mod tests { #[test] fn storage_option_revision_takes_precedence() { + use crate::object_store::StorageOptionsAccessor; + use std::sync::Arc; let url = Url::parse("hf://datasets/acme/repo/data/file").unwrap(); let params = ObjectStoreParams { - storage_options: Some(HashMap::from([( - String::from("hf_revision"), - String::from("stable"), - )])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( + HashMap::from([(String::from("hf_revision"), String::from("stable"))]), + ))), ..Default::default() }; // new_store should accept without creating operator; test precedence via builder config @@ -175,8 +178,7 @@ config_map.insert("repo_type".to_string(), repo_type); config_map.insert("repo".to_string(), repo_id); if let Some(rev) = params - .storage_options - .as_ref() + .storage_options() .unwrap() .get("hf_revision") .cloned() diff --git a/rust/lance-io/src/object_store/providers/local.rs b/rust/lance-io/src/object_store/providers/local.rs index 74f2777992b..78c8c9632c4 100644 --- a/rust/lance-io/src/object_store/providers/local.rs +++ b/rust/lance-io/src/object_store/providers/local.rs @@ -20,7 +20,7 @@ pub struct FileStoreProvider; impl ObjectStoreProvider for FileStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result<ObjectStore> { let block_size = params.block_size.unwrap_or(DEFAULT_LOCAL_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); let download_retry_count = storage_options.download_retry_count(); Ok(ObjectStore { inner: Arc::new(LocalFileSystem::new()), @@ -32,6 +32,8 @@ impl ObjectStoreProvider for FileStoreProvider { io_parallelism: DEFAULT_LOCAL_IO_PARALLELISM, download_retry_count, io_tracker: Default::default(), + store_prefix: self + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } diff --git a/rust/lance-io/src/object_store/providers/memory.rs b/rust/lance-io/src/object_store/providers/memory.rs index 9519806ed70..addc2fafc80 100644 --- a/rust/lance-io/src/object_store/providers/memory.rs +++ b/rust/lance-io/src/object_store/providers/memory.rs @@ -17,9 +17,9 @@ pub struct MemoryStoreProvider; #[async_trait::async_trait] impl ObjectStoreProvider for MemoryStoreProvider { - async fn new_store(&self, _base_path: Url, params: &ObjectStoreParams) -> Result<ObjectStore> { + async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result<ObjectStore> { let block_size = params.block_size.unwrap_or(DEFAULT_LOCAL_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); let download_retry_count = storage_options.download_retry_count(); Ok(ObjectStore { inner: Arc::new(InMemory::new()), @@ -31,6 +31,8 @@ impl ObjectStoreProvider for MemoryStoreProvider { io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, download_retry_count, io_tracker: Default::default(), + store_prefix: self + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } diff --git a/rust/lance-io/src/object_store/providers/oss.rs
b/rust/lance-io/src/object_store/providers/oss.rs index 3437ec8d1b6..80f161b233e 100644 --- a/rust/lance-io/src/object_store/providers/oss.rs +++ b/rust/lance-io/src/object_store/providers/oss.rs @@ -22,7 +22,7 @@ pub struct OssStoreProvider; impl ObjectStoreProvider for OssStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result<ObjectStore> { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); let bucket = base_path .host_str() @@ -103,6 +103,7 @@ impl ObjectStoreProvider for OssStoreProvider { io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, download_retry_count: storage_options.download_retry_count(), io_tracker: Default::default(), + store_prefix: self.calculate_object_store_prefix(&url, params.storage_options())?, }) } } diff --git a/rust/lance-io/src/object_store/storage_options.rs b/rust/lance-io/src/object_store/storage_options.rs index 9405f95d70c..d0f5cc20e93 100644 --- a/rust/lance-io/src/object_store/storage_options.rs +++ b/rust/lance-io/src/object_store/storage_options.rs @@ -1,25 +1,42 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -//! Storage options provider for dynamic credential fetching +//! Storage options provider and accessor for dynamic credential fetching //! -//! This module provides a trait for fetching storage options from various sources -//! (namespace servers, secret managers, etc.) with support for expiration tracking -//! and automatic refresh. +//! This module provides: +//! - [`StorageOptionsProvider`] trait for fetching storage options from various sources +//! (namespace servers, secret managers, etc.) with support for expiration tracking +//! - [`StorageOptionsAccessor`] for unified access to storage options with automatic +//!
caching and refresh use std::collections::HashMap; use std::fmt; use std::sync::Arc; use std::time::Duration; + +#[cfg(test)] +use mock_instant::thread_local::{SystemTime, UNIX_EPOCH}; + +#[cfg(not(test))] +use std::time::{SystemTime, UNIX_EPOCH}; -use crate::{Error, Result}; use async_trait::async_trait; use lance_namespace::models::DescribeTableRequest; use lance_namespace::LanceNamespace; use snafu::location; +use tokio::sync::RwLock; + +use crate::{Error, Result}; /// Key for the expiration timestamp in storage options HashMap pub const EXPIRES_AT_MILLIS_KEY: &str = "expires_at_millis"; +/// Key for the refresh offset in storage options HashMap (milliseconds before expiry to refresh) +pub const REFRESH_OFFSET_MILLIS_KEY: &str = "refresh_offset_millis"; + +/// Default refresh offset: 60 seconds before expiration +const DEFAULT_REFRESH_OFFSET_MILLIS: u64 = 60_000; + /// Trait for providing storage options with expiration tracking /// /// Implementations can fetch storage options from various sources (namespace servers, @@ -113,7 +130,7 @@ impl StorageOptionsProvider for LanceNamespaceStorageOptionsProvider { async fn fetch_storage_options(&self) -> Result<Option<HashMap<String, String>>> { let request = DescribeTableRequest { id: Some(self.table_id.clone()), - version: None, + ..Default::default() }; let response = self @@ -139,3 +156,558 @@ ) } } + +/// Unified access to storage options with automatic caching and refresh +/// +/// This struct bundles static storage options with an optional dynamic provider, +/// handling all caching and refresh logic internally. It provides a single entry point +/// for accessing storage options regardless of whether they're static or dynamic. +/// +/// # Behavior +/// +/// - If only static options are provided, returns those options +/// - If a provider is configured, fetches from provider and caches results +/// - Automatically refreshes cached options before expiration (based on refresh_offset) +/// - Uses `expires_at_millis` key to track expiration +/// +/// # Thread Safety +/// +/// The accessor is thread-safe and can be shared across multiple tasks. +/// Concurrent refresh attempts are deduplicated using a try-lock mechanism. +pub struct StorageOptionsAccessor { + /// Initial/fallback static storage options + initial_options: Option<HashMap<String, String>>, + + /// Optional dynamic provider for refreshing options + provider: Option<Arc<dyn StorageOptionsProvider>>, + + /// Cached storage options with expiration tracking + cache: Arc<RwLock<Option<CachedStorageOptions>>>, + + /// Duration before expiry to trigger refresh + refresh_offset: Duration, +} + +impl fmt::Debug for StorageOptionsAccessor { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("StorageOptionsAccessor") + .field("has_initial_options", &self.initial_options.is_some()) + .field("has_provider", &self.provider.is_some()) + .field("refresh_offset", &self.refresh_offset) + .finish() + } +} + +#[derive(Debug, Clone)] +struct CachedStorageOptions { + options: HashMap<String, String>, + expires_at_millis: Option<u64>, +} + +impl StorageOptionsAccessor { + /// Extract refresh offset from storage options, or use default + fn extract_refresh_offset(options: &HashMap<String, String>) -> Duration { + options + .get(REFRESH_OFFSET_MILLIS_KEY) + .and_then(|s| s.parse::<u64>().ok()) + .map(Duration::from_millis) + .unwrap_or(Duration::from_millis(DEFAULT_REFRESH_OFFSET_MILLIS)) + } + + /// Create an accessor with only static options (no refresh capability) + /// + /// The returned accessor will always return the provided options. + /// This is useful when credentials don't expire or are managed externally.
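+ /// A minimal usage sketch (illustrative; the keys shown are examples):
+ /// ```ignore
+ /// let accessor = StorageOptionsAccessor::with_static_options(HashMap::from([
+ ///     ("aws_access_key_id".to_string(), "AKID".to_string()),
+ ///     ("aws_secret_access_key".to_string(), "SECRET".to_string()),
+ /// ]));
+ /// assert!(!accessor.has_provider());
+ /// ```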
+ pub fn with_static_options(options: HashMap<String, String>) -> Self { + let expires_at_millis = options + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::<u64>().ok()); + let refresh_offset = Self::extract_refresh_offset(&options); + + Self { + initial_options: Some(options.clone()), + provider: None, + cache: Arc::new(RwLock::new(Some(CachedStorageOptions { + options, + expires_at_millis, + }))), + refresh_offset, + } + } + + /// Create an accessor with a dynamic provider (no initial options) + /// + /// The accessor will fetch from the provider on first access and cache + /// the results. Refresh happens automatically before expiration. + /// Uses the default refresh offset (60 seconds) until options are fetched. + /// + /// # Arguments + /// * `provider` - The storage options provider for fetching fresh options + pub fn with_provider(provider: Arc<dyn StorageOptionsProvider>) -> Self { + Self { + initial_options: None, + provider: Some(provider), + cache: Arc::new(RwLock::new(None)), + refresh_offset: Duration::from_millis(DEFAULT_REFRESH_OFFSET_MILLIS), + } + } + + /// Create an accessor with initial options and a dynamic provider + /// + /// Initial options are used until they expire, then the provider is called. + /// This avoids an immediate fetch when initial credentials are still valid. + /// The `refresh_offset_millis` key in initial_options controls refresh timing. + /// + /// # Arguments + /// * `initial_options` - Initial storage options to cache + /// * `provider` - The storage options provider for refreshing + pub fn with_initial_and_provider( + initial_options: HashMap<String, String>, + provider: Arc<dyn StorageOptionsProvider>, + ) -> Self { + let expires_at_millis = initial_options + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::<u64>().ok()); + let refresh_offset = Self::extract_refresh_offset(&initial_options); + + Self { + initial_options: Some(initial_options.clone()), + provider: Some(provider), + cache: Arc::new(RwLock::new(Some(CachedStorageOptions { + options: initial_options, + expires_at_millis, + }))), + refresh_offset, + } + } + + /// Get current valid storage options + /// + /// - Returns cached options if not expired + /// - Fetches from provider if expired or not cached + /// - Falls back to initial_options if provider returns None + /// + /// # Errors + /// + /// Returns an error if: + /// - The provider fails to fetch options + /// - No options are available (no cache, no provider, no initial options) + pub async fn get_storage_options(&self) -> Result<super::StorageOptions> { + loop { + match self.do_get_storage_options().await?
{ + Some(options) => return Ok(options), + None => { + // Lock was busy, wait 10ms before retrying + tokio::time::sleep(Duration::from_millis(10)).await; + continue; + } + } + } + } + + async fn do_get_storage_options(&self) -> Result<Option<super::StorageOptions>> { + // Check if we have valid cached options with read lock + { + let cached = self.cache.read().await; + if !self.needs_refresh(&cached) { + if let Some(cached_opts) = &*cached { + return Ok(Some(super::StorageOptions(cached_opts.options.clone()))); + } + } + } + + // If no provider, return initial options or error + let Some(provider) = &self.provider else { + return if let Some(initial) = &self.initial_options { + Ok(Some(super::StorageOptions(initial.clone()))) + } else { + Err(Error::IO { + source: Box::new(std::io::Error::other("No storage options available")), + location: location!(), + }) + }; + }; + + // Try to acquire write lock - if it fails, return None and let caller retry + let Ok(mut cache) = self.cache.try_write() else { + return Ok(None); + }; + + // Double-check if options are still stale after acquiring write lock + // (another thread might have refreshed them) + if !self.needs_refresh(&cache) { + if let Some(cached_opts) = &*cache { + return Ok(Some(super::StorageOptions(cached_opts.options.clone()))); + } + } + + log::debug!( + "Refreshing storage options from provider: {}", + provider.provider_id() + ); + + let storage_options_map = + provider + .fetch_storage_options() + .await + .map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to fetch storage options: {}", + e + ))), + location: location!(), + })?; + + let Some(options) = storage_options_map else { + // Provider returned None, fall back to initial options + if let Some(initial) = &self.initial_options { + return Ok(Some(super::StorageOptions(initial.clone()))); + } + return Err(Error::IO { + source: Box::new(std::io::Error::other( + "Provider returned no storage options", + )), + location: location!(), + }); + }; + + let expires_at_millis = options + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::<u64>().ok()); + + if let Some(expires_at) = expires_at_millis { + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or(Duration::from_secs(0)) + .as_millis() as u64; + let expires_in_secs = (expires_at.saturating_sub(now_ms)) / 1000; + log::debug!( + "Successfully refreshed storage options from provider: {}, options expire in {} seconds", + provider.provider_id(), + expires_in_secs + ); + } else { + log::debug!( + "Successfully refreshed storage options from provider: {} (no expiration)", + provider.provider_id() + ); + } + + *cache = Some(CachedStorageOptions { + options: options.clone(), + expires_at_millis, + }); + + Ok(Some(super::StorageOptions(options))) + } + + fn needs_refresh(&self, cached: &Option<CachedStorageOptions>) -> bool { + match cached { + None => true, + Some(cached_opts) => { + if let Some(expires_at_millis) = cached_opts.expires_at_millis { + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or(Duration::from_secs(0)) + .as_millis() as u64; + + // Refresh if we're within the refresh offset of expiration + let refresh_offset_millis = self.refresh_offset.as_millis() as u64; + now_ms + refresh_offset_millis >= expires_at_millis + } else { + // No expiration means options never expire + false + } + } + } + } +
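+ // Worked example of the refresh check above (illustrative numbers): with
+ // expires_at_millis = 700_000 and refresh_offset = 300_000 ms, a call at
+ // now_ms = 350_000 gives 350_000 + 300_000 = 650_000 < 700_000, so the
+ // cache is still fresh; at now_ms = 400_000 the sum reaches 700_000 and
+ // a refresh is triggered.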
+ /// This does not trigger any refresh, even if the options have expired. + pub fn initial_storage_options(&self) -> Option<&HashMap<String, String>> { + self.initial_options.as_ref() + } + + /// Get the accessor ID for equality/hashing + /// + /// Returns the `provider_id` if a provider exists, otherwise generates + /// a stable ID from the initial options hash. + pub fn accessor_id(&self) -> String { + if let Some(provider) = &self.provider { + provider.provider_id() + } else if let Some(initial) = &self.initial_options { + // Generate a stable ID from initial options + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let mut hasher = DefaultHasher::new(); + let mut keys: Vec<_> = initial.keys().collect(); + keys.sort(); + for key in keys { + key.hash(&mut hasher); + initial.get(key).hash(&mut hasher); + } + format!("static_options_{:x}", hasher.finish()) + } else { + "empty_accessor".to_string() + } + } + + /// Check if this accessor has a dynamic provider + pub fn has_provider(&self) -> bool { + self.provider.is_some() + } + + /// Get the refresh offset duration + pub fn refresh_offset(&self) -> Duration { + self.refresh_offset + } + + /// Get the storage options provider, if any + pub fn provider(&self) -> Option<&Arc<dyn StorageOptionsProvider>> { + self.provider.as_ref() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use mock_instant::thread_local::MockClock; + + #[derive(Debug)] + struct MockStorageOptionsProvider { + call_count: Arc<RwLock<usize>>, + expires_in_millis: Option<u64>, + } + + impl MockStorageOptionsProvider { + fn new(expires_in_millis: Option<u64>) -> Self { + Self { + call_count: Arc::new(RwLock::new(0)), + expires_in_millis, + } + } + + async fn get_call_count(&self) -> usize { + *self.call_count.read().await + } + } + + #[async_trait] + impl StorageOptionsProvider for MockStorageOptionsProvider { + async fn fetch_storage_options(&self) -> Result<Option<HashMap<String, String>>> { + let count = { + let mut c = self.call_count.write().await; + *c += 1; + *c + }; + + let mut options = HashMap::from([ + ("aws_access_key_id".to_string(), format!("AKID_{}", count)), + ( + "aws_secret_access_key".to_string(), + format!("SECRET_{}", count), + ), + ("aws_session_token".to_string(), format!("TOKEN_{}", count)), + ]); + + if let Some(expires_in) = self.expires_in_millis { + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() as u64; + let expires_at = now_ms + expires_in; + options.insert(EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()); + } + + Ok(Some(options)) + } + + fn provider_id(&self) -> String { + let ptr = Arc::as_ptr(&self.call_count) as usize; + format!("MockStorageOptionsProvider {{ id: {} }}", ptr) + } + } + + #[tokio::test] + async fn test_static_options_only() { + let options = HashMap::from([ + ("key1".to_string(), "value1".to_string()), + ("key2".to_string(), "value2".to_string()), + ]); + let accessor = StorageOptionsAccessor::with_static_options(options.clone()); + + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0, options); + assert!(!accessor.has_provider()); + assert_eq!(accessor.initial_storage_options(), Some(&options)); + } + + #[tokio::test] + async fn test_provider_only() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + let accessor = StorageOptionsAccessor::with_provider(mock_provider.clone()); + + let result = accessor.get_storage_options().await.unwrap(); + assert!(result.0.contains_key("aws_access_key_id"));
assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert!(accessor.has_provider()); + assert_eq!(accessor.initial_storage_options(), None); + assert_eq!(mock_provider.get_call_count().await, 1); + } + + #[tokio::test] + async fn test_initial_and_provider_uses_initial_first() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let now_ms = MockClock::system_time().as_millis() as u64; + let expires_at = now_ms + 600_000; // 10 minutes from now + + let initial = HashMap::from([ + ("aws_access_key_id".to_string(), "INITIAL_KEY".to_string()), + ( + "aws_secret_access_key".to_string(), + "INITIAL_SECRET".to_string(), + ), + (EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()), + ]); + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + let accessor = StorageOptionsAccessor::with_initial_and_provider( + initial.clone(), + mock_provider.clone(), + ); + + // First call uses initial + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "INITIAL_KEY"); + assert_eq!(mock_provider.get_call_count().await, 0); // Provider not called yet + } + + #[tokio::test] + async fn test_caching_and_refresh() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); // 10 min expiry + // Use with_initial_and_provider to set custom refresh_offset_millis (5 min = 300000ms) + let now_ms = MockClock::system_time().as_millis() as u64; + let expires_at = now_ms + 600_000; // 10 minutes from now + let initial = HashMap::from([ + (EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()), + (REFRESH_OFFSET_MILLIS_KEY.to_string(), "300000".to_string()), // 5 min refresh offset + ]); + let accessor = + StorageOptionsAccessor::with_initial_and_provider(initial, mock_provider.clone()); + + // First call uses initial cached options + let result = accessor.get_storage_options().await.unwrap(); + assert!(result.0.contains_key(EXPIRES_AT_MILLIS_KEY)); + assert_eq!(mock_provider.get_call_count().await, 0); + + // Advance time to 6 minutes - should trigger refresh (within 5 min refresh offset) + MockClock::set_system_time(Duration::from_secs(100_000 + 360)); + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert_eq!(mock_provider.get_call_count().await, 1); + } + + #[tokio::test] + async fn test_expired_initial_triggers_refresh() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let now_ms = MockClock::system_time().as_millis() as u64; + let expired_time = now_ms - 1_000; // Expired 1 second ago + + let initial = HashMap::from([ + ("aws_access_key_id".to_string(), "EXPIRED_KEY".to_string()), + (EXPIRES_AT_MILLIS_KEY.to_string(), expired_time.to_string()), + ]); + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + let accessor = + StorageOptionsAccessor::with_initial_and_provider(initial, mock_provider.clone()); + + // Should fetch from provider since initial is expired + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert_eq!(mock_provider.get_call_count().await, 1); + } + + #[tokio::test] + async fn test_accessor_id_with_provider() { + let mock_provider = Arc::new(MockStorageOptionsProvider::new(None)); + let accessor = StorageOptionsAccessor::with_provider(mock_provider); + + let id = 
accessor.accessor_id(); + assert!(id.starts_with("MockStorageOptionsProvider")); + } + + #[tokio::test] + async fn test_accessor_id_static() { + let options = HashMap::from([("key".to_string(), "value".to_string())]); + let accessor = StorageOptionsAccessor::with_static_options(options); + + let id = accessor.accessor_id(); + assert!(id.starts_with("static_options_")); + } + + #[tokio::test] + async fn test_concurrent_access() { + // Create a mock provider with far future expiration + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(9999999999999))); + + let accessor = Arc::new(StorageOptionsAccessor::with_provider(mock_provider.clone())); + + // Spawn 10 concurrent tasks that all try to get options at the same time + let mut handles = vec![]; + for i in 0..10 { + let acc = accessor.clone(); + let handle = tokio::spawn(async move { + let result = acc.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + i + }); + handles.push(handle); + } + + // Wait for all tasks to complete + let results: Vec<_> = futures::future::join_all(handles) + .await + .into_iter() + .map(|r| r.unwrap()) + .collect(); + + // Verify all 10 tasks completed successfully + assert_eq!(results.len(), 10); + + // The provider should have been called exactly once + let call_count = mock_provider.get_call_count().await; + assert_eq!( + call_count, 1, + "Provider should be called exactly once despite concurrent access" + ); + } + + #[tokio::test] + async fn test_no_expiration_never_refreshes() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let mock_provider = Arc::new(MockStorageOptionsProvider::new(None)); // No expiration + let accessor = StorageOptionsAccessor::with_provider(mock_provider.clone()); + + // First call fetches + accessor.get_storage_options().await.unwrap(); + assert_eq!(mock_provider.get_call_count().await, 1); + + // Advance time significantly + MockClock::set_system_time(Duration::from_secs(200_000)); + + // Should still use cached options + accessor.get_storage_options().await.unwrap(); + assert_eq!(mock_provider.get_call_count().await, 1); + } +} diff --git a/rust/lance-namespace-impls/Cargo.toml b/rust/lance-namespace-impls/Cargo.toml index 9ce32692ffc..b41e7f44e01 100644 --- a/rust/lance-namespace-impls/Cargo.toml +++ b/rust/lance-namespace-impls/Cargo.toml @@ -13,7 +13,7 @@ rust-version.workspace = true [features] default = ["dir-aws", "dir-azure", "dir-gcp", "dir-oss", "dir-huggingface"] -rest = ["dep:reqwest"] +rest = ["dep:reqwest", "dep:serde"] rest-adapter = ["dep:axum", "dep:tower", "dep:tower-http", "dep:serde"] # Cloud storage features for directory implementation - align with lance-io dir-gcp = ["lance-io/gcp", "lance/gcp"] @@ -21,6 +21,10 @@ dir-aws = ["lance-io/aws", "lance/aws"] dir-azure = ["lance-io/azure", "lance/azure"] dir-oss = ["lance-io/oss", "lance/oss"] dir-huggingface = ["lance-io/huggingface", "lance/huggingface"] +# Credential vending features +credential-vendor-aws = ["dep:aws-sdk-sts", "dep:aws-config", "dep:aws-credential-types", "dep:sha2", "dep:base64"] +credential-vendor-gcp = ["dep:google-cloud-auth", "dep:reqwest", "dep:serde", "dep:sha2", "dep:base64"] +credential-vendor-azure = ["dep:azure_core", "dep:azure_identity", "dep:azure_storage", "dep:azure_storage_blobs", "dep:time", "dep:sha2", "dep:base64", "dep:reqwest"] [dependencies] lance-namespace.workspace = true @@ -60,6 +64,24 @@ serde_json = { workspace = true } futures.workspace = true log.workspace = true rand.workspace 
= true +chrono.workspace = true + +# AWS credential vending dependencies (optional, enabled by "credential-vendor-aws" feature) +aws-sdk-sts = { version = "1.38.0", optional = true } +aws-config = { workspace = true, optional = true } +aws-credential-types = { workspace = true, optional = true } +sha2 = { version = "0.10", optional = true } +base64 = { version = "0.22", optional = true } + +# GCP credential vending dependencies (optional, enabled by "credential-vendor-gcp" feature) +google-cloud-auth = { version = "0.18", optional = true } + +# Azure credential vending dependencies (optional, enabled by "credential-vendor-azure" feature) +azure_core = { version = "0.21", optional = true } +azure_identity = { version = "0.21", optional = true } +azure_storage = { version = "0.21", optional = true } +azure_storage_blobs = { version = "0.21", optional = true } +time = { version = "0.3", optional = true } [dev-dependencies] tokio = { workspace = true, features = ["full"] } diff --git a/rust/lance-namespace-impls/src/connect.rs b/rust/lance-namespace-impls/src/connect.rs index aa84e2fd6c1..ba26fda3643 100644 --- a/rust/lance-namespace-impls/src/connect.rs +++ b/rust/lance-namespace-impls/src/connect.rs @@ -10,6 +10,8 @@ use lance::session::Session; use lance_core::{Error, Result}; use lance_namespace::LanceNamespace; +use crate::context::DynamicContextProvider; + /// Builder for creating Lance namespace connections. /// /// This builder provides a fluent API for configuring and establishing @@ -46,11 +48,53 @@ use lance_namespace::LanceNamespace; /// # Ok(()) /// # } /// ``` -#[derive(Debug, Clone)] +/// +/// ## With Dynamic Context Provider +/// +/// ```no_run +/// # use lance_namespace_impls::{ConnectBuilder, DynamicContextProvider, OperationInfo}; +/// # use std::collections::HashMap; +/// # use std::sync::Arc; +/// # async fn example() -> Result<(), Box<dyn std::error::Error>> { +/// #[derive(Debug)] +/// struct MyProvider; +/// +/// impl DynamicContextProvider for MyProvider { +/// fn provide_context(&self, info: &OperationInfo) -> HashMap<String, String> { +/// let mut ctx = HashMap::new(); +/// ctx.insert("headers.Authorization".to_string(), "Bearer token".to_string()); +/// ctx +/// } +/// } +/// +/// let namespace = ConnectBuilder::new("rest") +/// .property("uri", "https://api.example.com") +/// .context_provider(Arc::new(MyProvider)) +/// .connect() +/// .await?; +/// # Ok(()) +/// # } +/// ``` +#[derive(Clone)] pub struct ConnectBuilder { impl_name: String, properties: HashMap<String, String>, session: Option<Arc<Session>>, + context_provider: Option<Arc<dyn DynamicContextProvider>>, +} + +impl std::fmt::Debug for ConnectBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ConnectBuilder") + .field("impl_name", &self.impl_name) + .field("properties", &self.properties) + .field("session", &self.session) + .field( + "context_provider", + &self.context_provider.as_ref().map(|_| "Some(...)"), + ) + .finish() + } } impl ConnectBuilder { @@ -64,6 +108,7 @@ impl ConnectBuilder { impl_name: impl_name.into(), properties: HashMap::new(), session: None, + context_provider: None, } } @@ -102,6 +147,20 @@ impl ConnectBuilder { self } + /// Set a dynamic context provider for per-request context. + /// + /// The provider will be called before each operation to generate + /// additional context. For RestNamespace, context keys that start with + /// `headers.` are converted to HTTP headers by stripping the prefix.
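+ /// For example, a returned entry `("headers.Authorization", "Bearer abc123")` + /// becomes the `Authorization: Bearer abc123` request header.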
+ /// + /// # Arguments + /// + /// * `provider` - The context provider implementation + pub fn context_provider(mut self, provider: Arc<dyn DynamicContextProvider>) -> Self { + self.context_provider = Some(provider); + self + } + /// Build and establish the connection to the namespace. /// /// # Returns @@ -119,8 +178,12 @@ impl ConnectBuilder { #[cfg(feature = "rest")] "rest" => { // Create REST implementation (REST doesn't use session) - crate::rest::RestNamespaceBuilder::from_properties(self.properties) - .map(|builder| Arc::new(builder.build()) as Arc<dyn LanceNamespace>) + let mut builder = + crate::rest::RestNamespaceBuilder::from_properties(self.properties)?; + if let Some(provider) = self.context_provider { + builder = builder.context_provider(provider); + } + Ok(Arc::new(builder.build()) as Arc<dyn LanceNamespace>) } #[cfg(not(feature = "rest"))] "rest" => Err(Error::Namespace { @@ -130,13 +193,17 @@ }), "dir" => { // Create directory implementation (always available) - crate::dir::DirectoryNamespaceBuilder::from_properties( + let mut builder = crate::dir::DirectoryNamespaceBuilder::from_properties( self.properties, self.session, - )? - .build() - .await - .map(|ns| Arc::new(ns) as Arc<dyn LanceNamespace>) + )?; + if let Some(provider) = self.context_provider { + builder = builder.context_provider(provider); + } + builder + .build() + .await + .map(|ns| Arc::new(ns) as Arc<dyn LanceNamespace>) } _ => Err(Error::Namespace { source: format!( diff --git a/rust/lance-namespace-impls/src/context.rs b/rust/lance-namespace-impls/src/context.rs new file mode 100644 index 00000000000..028eb342bac --- /dev/null +++ b/rust/lance-namespace-impls/src/context.rs @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Dynamic context provider for per-request context overrides. +//! +//! This module provides the [`DynamicContextProvider`] trait that enables +//! per-request context injection (e.g., dynamic authentication headers). +//! +//! ## Usage +//! +//! Implement the trait and pass it to the namespace builders: +//! +//! ```ignore +//! use lance_namespace_impls::{RestNamespaceBuilder, DynamicContextProvider, OperationInfo}; +//! use std::collections::HashMap; +//! use std::sync::Arc; +//! +//! #[derive(Debug)] +//! struct MyProvider; +//! +//! impl DynamicContextProvider for MyProvider { +//! fn provide_context(&self, info: &OperationInfo) -> HashMap<String, String> { +//! let mut context = HashMap::new(); +//! context.insert("headers.Authorization".to_string(), format!("Bearer {}", get_current_token())); +//! context.insert("headers.X-Request-Id".to_string(), generate_request_id()); +//! context +//! } +//! } +//! +//! let namespace = RestNamespaceBuilder::new("https://api.example.com") +//! .context_provider(Arc::new(MyProvider)) +//! .build(); +//! ``` +//! +//! For RestNamespace, context keys that start with `headers.` are converted to HTTP headers +//! by stripping the prefix. For example, `{"headers.Authorization": "Bearer abc123"}` +//! becomes the `Authorization: Bearer abc123` header. Keys without the `headers.` prefix +//! are ignored for HTTP headers but may be used for other purposes. + +use std::collections::HashMap; + +/// Information about the namespace operation being executed. +/// +/// This is passed to the [`DynamicContextProvider`] to allow it to make +/// context decisions based on the operation.
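+/// +/// A minimal construction sketch: +/// +/// ```ignore +/// let info = OperationInfo::new("describe_table", "workspace$my_table"); +/// assert_eq!(info.operation, "describe_table"); +/// ```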
+#[derive(Debug, Clone)] +pub struct OperationInfo { + /// The operation name (e.g., "list_tables", "describe_table", "create_namespace") + pub operation: String, + /// The object ID for the operation (namespace or table identifier). + /// This is the delimited string form, e.g., "workspace$table_name". + pub object_id: String, +} + +impl OperationInfo { + /// Create a new OperationInfo. + pub fn new(operation: impl Into<String>, object_id: impl Into<String>) -> Self { + Self { + operation: operation.into(), + object_id: object_id.into(), + } + } +} + +/// Trait for providing dynamic request context. +/// +/// Implementations can generate per-request context (e.g., authentication headers) +/// based on the operation being performed. The provider is called synchronously +/// before each namespace operation. +/// +/// For RestNamespace, context keys that start with `headers.` are converted to +/// HTTP headers by stripping the prefix. For example, `{"headers.Authorization": "Bearer token"}` +/// becomes the `Authorization: Bearer token` header. +/// +/// ## Thread Safety +/// +/// Implementations must be `Send + Sync` as the provider may be called from +/// multiple threads concurrently. +/// +/// ## Error Handling +/// +/// If the provider needs to signal an error, it should return an empty HashMap +/// and log the error. The namespace operation will proceed without the +/// additional context. +pub trait DynamicContextProvider: Send + Sync + std::fmt::Debug { + /// Provide context for a namespace operation. + /// + /// # Arguments + /// + /// * `info` - Information about the operation being performed + /// + /// # Returns + /// + /// Returns a HashMap of context key-value pairs. For HTTP headers, use keys + /// with the `headers.` prefix (e.g., `headers.Authorization`). + /// Returns an empty HashMap if no additional context is needed.
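+ /// + /// A sketch of an implementation that injects a static bearer token (the token value is illustrative): + /// + /// ```ignore + /// fn provide_context(&self, _info: &OperationInfo) -> HashMap<String, String> { + ///     HashMap::from([( + ///         "headers.Authorization".to_string(), + ///         "Bearer abc123".to_string(), + ///     )]) + /// } + /// ```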
+ fn provide_context(&self, info: &OperationInfo) -> HashMap<String, String>; +} + +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Debug)] + struct MockContextProvider { + prefix: String, + } + + impl DynamicContextProvider for MockContextProvider { + fn provide_context(&self, info: &OperationInfo) -> HashMap<String, String> { + let mut context = HashMap::new(); + context.insert( + "test-header".to_string(), + format!("{}-{}", self.prefix, info.operation), + ); + context.insert("object-id".to_string(), info.object_id.clone()); + context + } + } + + #[test] + fn test_operation_info_creation() { + let info = OperationInfo::new("describe_table", "workspace$my_table"); + assert_eq!(info.operation, "describe_table"); + assert_eq!(info.object_id, "workspace$my_table"); + } + + #[test] + fn test_context_provider_basic() { + let provider = MockContextProvider { + prefix: "test".to_string(), + }; + + let info = OperationInfo::new("list_tables", "workspace$ns"); + + let context = provider.provide_context(&info); + assert_eq!( + context.get("test-header"), + Some(&"test-list_tables".to_string()) + ); + assert_eq!(context.get("object-id"), Some(&"workspace$ns".to_string())); + } + + #[test] + fn test_empty_context() { + #[derive(Debug)] + struct EmptyProvider; + + impl DynamicContextProvider for EmptyProvider { + fn provide_context(&self, _info: &OperationInfo) -> HashMap<String, String> { + HashMap::new() + } + } + + let provider = EmptyProvider; + let info = OperationInfo::new("list_tables", "ns"); + + let context = provider.provide_context(&info); + assert!(context.is_empty()); + } +} diff --git a/rust/lance-namespace-impls/src/credentials.rs b/rust/lance-namespace-impls/src/credentials.rs new file mode 100644 index 00000000000..f9f7ecc7950 --- /dev/null +++ b/rust/lance-namespace-impls/src/credentials.rs @@ -0,0 +1,795 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Credential vending for cloud storage access. +//! +//! This module provides credential vending functionality that generates +//! temporary, scoped credentials for accessing cloud storage. Similar to +//! Apache Polaris's credential vending, it supports: +//! +//! - **AWS**: STS AssumeRole with scoped IAM policies (requires the `credential-vendor-aws` feature) +//! - **GCP**: OAuth2 tokens with access boundaries (requires the `credential-vendor-gcp` feature) +//! - **Azure**: SAS tokens with user delegation keys (requires the `credential-vendor-azure` feature) +//! +//! The appropriate vendor is automatically selected based on the table location URI scheme: +//! - `s3://` for AWS +//! - `gs://` for GCP +//! - `az://` for Azure +//! +//! ## Configuration via Properties +//! +//! Credential vendors are configured via properties with the `credential_vendor.` prefix. +//! +//! ### Properties format: +//! +//! ```text +//! # Required to enable credential vending +//! credential_vendor.enabled = "true" +//! +//! # Common properties (apply to all providers) +//! credential_vendor.permission = "read" # read, write, or admin (default: read) +//! +//! # AWS-specific properties (for s3:// locations) +//! credential_vendor.aws_role_arn = "arn:aws:iam::123456789012:role/MyRole" # required for AWS +//! credential_vendor.aws_external_id = "my-external-id" +//! credential_vendor.aws_region = "us-west-2" +//! credential_vendor.aws_role_session_name = "my-session" +//! credential_vendor.aws_duration_millis = "3600000" # 1 hour (default, range: 15min-12hrs) +//! +//! # GCP-specific properties (for gs:// locations) +//!
# Note: GCP token duration cannot be configured; it's determined by the STS endpoint +//! # To use a service account key file, set GOOGLE_APPLICATION_CREDENTIALS env var before starting +//! credential_vendor.gcp_service_account = "my-sa@project.iam.gserviceaccount.com" +//! +//! # Azure-specific properties (for az:// locations) +//! credential_vendor.azure_account_name = "mystorageaccount" # required for Azure +//! credential_vendor.azure_tenant_id = "my-tenant-id" +//! credential_vendor.azure_duration_millis = "3600000" # 1 hour (default, up to 7 days) +//! ``` +//! +//! ### Example using ConnectBuilder: +//! +//! ```ignore +//! ConnectBuilder::new("dir") +//! .property("root", "s3://bucket/path") +//! .property("credential_vendor.enabled", "true") +//! .property("credential_vendor.aws_role_arn", "arn:aws:iam::123456789012:role/MyRole") +//! .property("credential_vendor.permission", "read") +//! .connect() +//! .await?; +//! ``` + +#[cfg(feature = "credential-vendor-aws")] +pub mod aws; + +#[cfg(feature = "credential-vendor-azure")] +pub mod azure; + +#[cfg(feature = "credential-vendor-gcp")] +pub mod gcp; + +/// Credential caching module. +/// Available when any credential vendor feature is enabled. +#[cfg(any( + feature = "credential-vendor-aws", + feature = "credential-vendor-azure", + feature = "credential-vendor-gcp" +))] +pub mod cache; + +use std::collections::HashMap; +use std::str::FromStr; + +use async_trait::async_trait; +use lance_core::Result; +use lance_io::object_store::uri_to_url; +use lance_namespace::models::Identity; + +/// Default credential duration: 1 hour (3600000 milliseconds) +pub const DEFAULT_CREDENTIAL_DURATION_MILLIS: u64 = 3600 * 1000; + +/// Redact a credential string for logging, showing first and last few characters. +/// +/// This is useful for debugging while avoiding exposure of sensitive data. +/// Format: `AKIAIOSF***MPLE` (first 8 + "***" + last 4) +/// +/// Shows 8 characters at the start (useful since AWS keys always start with AKIA/ASIA) +/// and 4 characters at the end. For short strings, shows only the first few with "***". +/// +/// # Security Note +/// +/// This function should only be used for identifiers and tokens, never for secrets +/// like `aws_secret_access_key` which should never be logged even in redacted form. +pub fn redact_credential(credential: &str) -> String { + const SHOW_START: usize = 8; + const SHOW_END: usize = 4; + const MIN_LENGTH_FOR_BOTH_ENDS: usize = SHOW_START + SHOW_END + 4; // Need at least 16 chars + + if credential.is_empty() { + return "[empty]".to_string(); + } + + if credential.len() < MIN_LENGTH_FOR_BOTH_ENDS { + // For short credentials, just show beginning + let show = credential.len().min(SHOW_START); + format!("{}***", &credential[..show]) + } else { + // Show first 8 and last 4 characters + format!( + "{}***{}", + &credential[..SHOW_START], + &credential[credential.len() - SHOW_END..] + ) + } +} + +/// Permission level for vended credentials. 
+/// +/// This determines what access the vended credentials will have: +/// - `Read`: Read-only access to all table content +/// - `Write`: Full read and write access (no delete) +/// - `Admin`: Full read, write, and delete access +/// +/// Permission enforcement by cloud provider: +/// - **AWS**: Permissions are enforced via scoped IAM policies attached to the AssumeRole request +/// - **Azure**: Permissions are enforced via SAS token permissions +/// - **GCP**: Permissions are enforced via Credential Access Boundaries (CAB) that downscope +/// the OAuth2 token to specific GCS IAM roles +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum VendedPermission { + /// Read-only access to all table content (metadata, indices, data files) + #[default] + Read, + /// Full read and write access (no delete). + /// Note: the delete restriction is best-effort and intended mainly for testing; + /// any caller with write permission can effectively "delete" a file by + /// overwriting it with empty content, so this level cannot prevent malicious deletion. + Write, + /// Full read, write, and delete access + Admin, +} + +impl VendedPermission { + /// Returns true if this permission allows writing + pub fn can_write(&self) -> bool { + matches!(self, Self::Write | Self::Admin) + } + + /// Returns true if this permission allows deleting + pub fn can_delete(&self) -> bool { + matches!(self, Self::Admin) + } +} + +impl FromStr for VendedPermission { + type Err = String; + + fn from_str(s: &str) -> std::result::Result<Self, Self::Err> { + match s.to_lowercase().as_str() { + "read" => Ok(Self::Read), + "write" => Ok(Self::Write), + "admin" => Ok(Self::Admin), + _ => Err(format!( + "Invalid permission '{}'. Must be one of: read, write, admin", + s + )), + } + } +} + +impl std::fmt::Display for VendedPermission { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Read => write!(f, "read"), + Self::Write => write!(f, "write"), + Self::Admin => write!(f, "admin"), + } + } +} + +/// Property key prefix for credential vendor properties. +/// Properties with this prefix are stripped when using `from_properties`. +pub const PROPERTY_PREFIX: &str = "credential_vendor."; + +/// Common property key to explicitly enable credential vending (short form). +pub const ENABLED: &str = "enabled"; + +/// Common property key for permission level (short form). +pub const PERMISSION: &str = "permission"; + +/// Common property key to enable credential caching (short form). +/// Default: true. Set to "false" to disable caching. +pub const CACHE_ENABLED: &str = "cache_enabled"; + +/// Common property key for API key salt (short form). +/// Used to hash API keys before comparison: SHA256(api_key + ":" + salt) +pub const API_KEY_SALT: &str = "api_key_salt"; + +/// Property key prefix for API key hash to permission mappings (short form). +/// Format: `api_key_hash.<hash> = "<permission>"` +pub const API_KEY_HASH_PREFIX: &str = "api_key_hash."; + +/// AWS-specific property keys (short form, without prefix) +#[cfg(feature = "credential-vendor-aws")] +pub mod aws_props { + pub const ROLE_ARN: &str = "aws_role_arn"; + pub const EXTERNAL_ID: &str = "aws_external_id"; + pub const REGION: &str = "aws_region"; + pub const ROLE_SESSION_NAME: &str = "aws_role_session_name"; + /// AWS credential duration in milliseconds. + /// Default: 3600000 (1 hour). Range: 900000 (15 min) to 43200000 (12 hours).
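+ /// Values outside this range are clamped when the STS request is built.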
+ pub const DURATION_MILLIS: &str = "aws_duration_millis"; +} + +/// GCP-specific property keys (short form, without prefix) +#[cfg(feature = "credential-vendor-gcp")] +pub mod gcp_props { + pub const SERVICE_ACCOUNT: &str = "gcp_service_account"; + + /// Workload Identity Provider resource name for OIDC token exchange. + /// Format: //iam.googleapis.com/projects/{project}/locations/global/workloadIdentityPools/{pool}/providers/{provider} + pub const WORKLOAD_IDENTITY_PROVIDER: &str = "gcp_workload_identity_provider"; + + /// Service account to impersonate after Workload Identity Federation (optional). + /// If not set, uses the federated identity directly. + pub const IMPERSONATION_SERVICE_ACCOUNT: &str = "gcp_impersonation_service_account"; +} + +/// Azure-specific property keys (short form, without prefix) +#[cfg(feature = "credential-vendor-azure")] +pub mod azure_props { + pub const TENANT_ID: &str = "azure_tenant_id"; + /// Azure storage account name. Required for credential vending. + pub const ACCOUNT_NAME: &str = "azure_account_name"; + /// Azure credential duration in milliseconds. + /// Default: 3600000 (1 hour). Azure SAS tokens can be valid up to 7 days. + pub const DURATION_MILLIS: &str = "azure_duration_millis"; + + /// Client ID of the Azure AD App Registration for Workload Identity Federation. + /// Required when using auth_token identity for OIDC token exchange. + pub const FEDERATED_CLIENT_ID: &str = "azure_federated_client_id"; +} + +/// Vended credentials with expiration information. +#[derive(Clone)] +pub struct VendedCredentials { + /// Storage options map containing credential keys. + /// - For AWS: `aws_access_key_id`, `aws_secret_access_key`, `aws_session_token` + /// - For GCP: `google_storage_token` + /// - For Azure: `azure_storage_sas_token`, `azure_storage_account_name` + pub storage_options: HashMap<String, String>, + + /// Expiration time in milliseconds since Unix epoch. + pub expires_at_millis: u64, +} + +impl std::fmt::Debug for VendedCredentials { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("VendedCredentials") + .field( + "storage_options", + &format!("[{} keys redacted]", self.storage_options.len()), + ) + .field("expires_at_millis", &self.expires_at_millis) + .finish() + } +} + +impl VendedCredentials { + /// Create new vended credentials. + pub fn new(storage_options: HashMap<String, String>, expires_at_millis: u64) -> Self { + Self { + storage_options, + expires_at_millis, + } + } + + /// Check if the credentials have expired. + pub fn is_expired(&self) -> bool { + let now_millis = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .expect("time went backwards") + .as_millis() as u64; + now_millis >= self.expires_at_millis + } +} + +/// Trait for credential vendors that generate temporary credentials. +/// +/// Each cloud provider has its own configuration passed via the vendor +/// implementation. The permission level is configured at vendor creation time +/// via [`VendedPermission`]. +#[async_trait] +pub trait CredentialVendor: Send + Sync + std::fmt::Debug { + /// Vend credentials for accessing the specified table location. + /// + /// The permission level (read/write/admin) is determined by the vendor's + /// configuration, not per-request.
When identity is provided, the vendor + /// may use different authentication flows: + /// + /// - `auth_token`: Use AssumeRoleWithWebIdentity (AWS validates the token) + /// - `api_key`: Validate against configured API key hashes and use AssumeRole + /// - `None`: Use static configuration with AssumeRole + /// + /// # Arguments + /// + /// * `table_location` - The table URI to vend credentials for + /// * `identity` - Optional identity from the request (api_key OR auth_token, mutually exclusive) + /// + /// # Returns + /// + /// Returns vended credentials with expiration information. + /// + /// # Errors + /// + /// Returns an error if identity validation fails (no fallback to static config). + async fn vend_credentials( + &self, + table_location: &str, + identity: Option<&Identity>, + ) -> Result<VendedCredentials>; + + /// Returns the cloud provider name (e.g., "aws", "gcp", "azure"). + fn provider_name(&self) -> &'static str; + + /// Returns the permission level configured for this vendor. + fn permission(&self) -> VendedPermission; +} + +/// Detect the cloud provider from a URI scheme. +/// +/// Supported schemes for credential vending: +/// - AWS S3: `s3://` +/// - GCP GCS: `gs://` +/// - Azure Blob: `az://` +/// +/// Returns "aws", "gcp", "azure", or "unknown". +pub fn detect_provider_from_uri(uri: &str) -> &'static str { + let Ok(url) = uri_to_url(uri) else { + return "unknown"; + }; + + match url.scheme() { + "s3" => "aws", + "gs" => "gcp", + "az" => "azure", + _ => "unknown", + } +} + +/// Check if credential vending is enabled. +/// +/// Returns true only if the `enabled` property is set to "true". +/// This expects properties with short names (prefix already stripped). +pub fn has_credential_vendor_config(properties: &HashMap<String, String>) -> bool { + properties + .get(ENABLED) + .map(|v| v.eq_ignore_ascii_case("true")) + .unwrap_or(false) +} + +/// Create a credential vendor for the specified table location based on its URI scheme. +/// +/// This function automatically detects the cloud provider from the table location +/// and creates the appropriate credential vendor using the provided properties.
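+/// +/// A usage sketch (bucket and role ARN are illustrative; keys use the short form +/// with the `credential_vendor.` prefix already stripped): +/// +/// ```ignore +/// let props = HashMap::from([ +///     ("enabled".to_string(), "true".to_string()), +///     ("aws_role_arn".to_string(), "arn:aws:iam::123456789012:role/MyRole".to_string()), +/// ]); +/// let vendor = create_credential_vendor_for_location("s3://bucket/table", &props).await?; +/// ```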
+/// +/// # Arguments +/// +/// * `table_location` - The table URI to create a vendor for (e.g., "s3://bucket/path") +/// * `properties` - Configuration properties for credential vendors +/// +/// # Returns +/// +/// Returns `Some(vendor)` if the provider is detected and configured, `None` if: +/// - The provider cannot be detected from the URI (e.g., a local file path) +/// - The required feature is not enabled for the detected provider +/// +/// # Errors +/// +/// Returns an error if the provider is detected but required configuration is missing: +/// - AWS: `credential_vendor.aws_role_arn` is required +/// - Azure: `credential_vendor.azure_account_name` is required +#[allow(unused_variables)] +pub async fn create_credential_vendor_for_location( + table_location: &str, + properties: &HashMap<String, String>, +) -> Result<Option<Box<dyn CredentialVendor>>> { + let provider = detect_provider_from_uri(table_location); + + let vendor: Option<Box<dyn CredentialVendor>> = match provider { + #[cfg(feature = "credential-vendor-aws")] + "aws" => create_aws_vendor(properties).await?, + + #[cfg(feature = "credential-vendor-gcp")] + "gcp" => create_gcp_vendor(properties).await?, + + #[cfg(feature = "credential-vendor-azure")] + "azure" => create_azure_vendor(properties)?, + + _ => None, + }; + + // Wrap with caching if enabled (default: true) + #[cfg(any( + feature = "credential-vendor-aws", + feature = "credential-vendor-azure", + feature = "credential-vendor-gcp" + ))] + if let Some(v) = vendor { + let cache_enabled = properties + .get(CACHE_ENABLED) + .map(|s| !s.eq_ignore_ascii_case("false")) + .unwrap_or(true); + + if cache_enabled { + return Ok(Some(Box::new(cache::CachingCredentialVendor::new(v)))); + } else { + return Ok(Some(v)); + } + } + + #[cfg(not(any( + feature = "credential-vendor-aws", + feature = "credential-vendor-azure", + feature = "credential-vendor-gcp" + )))] + let _ = vendor; + + Ok(None) +} + +/// Parse permission from properties, defaulting to Read +#[allow(dead_code)] +fn parse_permission(properties: &HashMap<String, String>) -> VendedPermission { + properties + .get(PERMISSION) + .and_then(|s| s.parse().ok()) + .unwrap_or_default() +} + +/// Parse duration from properties using a vendor-specific key, defaulting to DEFAULT_CREDENTIAL_DURATION_MILLIS +#[allow(dead_code)] +fn parse_duration_millis(properties: &HashMap<String, String>, key: &str) -> u64 { + properties + .get(key) + .and_then(|s| s.parse::<u64>().ok()) + .unwrap_or(DEFAULT_CREDENTIAL_DURATION_MILLIS) +} + +#[cfg(feature = "credential-vendor-aws")] +async fn create_aws_vendor( + properties: &HashMap<String, String>, +) -> Result<Option<Box<dyn CredentialVendor>>> { + use aws::{AwsCredentialVendor, AwsCredentialVendorConfig}; + use lance_core::Error; + + // AWS requires role_arn to be configured + let role_arn = properties + .get(aws_props::ROLE_ARN) + .ok_or_else(|| Error::InvalidInput { + source: "AWS credential vending requires 'credential_vendor.aws_role_arn' to be set" + .into(), + location: snafu::location!(), + })?; + + let duration_millis = parse_duration_millis(properties, aws_props::DURATION_MILLIS); + + let permission = parse_permission(properties); + + let mut config = AwsCredentialVendorConfig::new(role_arn) + .with_duration_millis(duration_millis) + .with_permission(permission); + + if let Some(external_id) = properties.get(aws_props::EXTERNAL_ID) { + config = config.with_external_id(external_id); + } + if let Some(region) = properties.get(aws_props::REGION) { + config = config.with_region(region); + } + if let Some(session_name) = properties.get(aws_props::ROLE_SESSION_NAME) { + config = config.with_role_session_name(session_name); + } + + let vendor =
AwsCredentialVendor::new(config).await?; + Ok(Some(Box::new(vendor))) +} + +#[cfg(feature = "credential-vendor-gcp")] +async fn create_gcp_vendor( + properties: &HashMap<String, String>, +) -> Result<Option<Box<dyn CredentialVendor>>> { + use gcp::{GcpCredentialVendor, GcpCredentialVendorConfig}; + + let permission = parse_permission(properties); + + let mut config = GcpCredentialVendorConfig::new().with_permission(permission); + + if let Some(sa) = properties.get(gcp_props::SERVICE_ACCOUNT) { + config = config.with_service_account(sa); + } + + let vendor = GcpCredentialVendor::new(config).await?; + Ok(Some(Box::new(vendor))) +} + +#[cfg(feature = "credential-vendor-azure")] +fn create_azure_vendor( + properties: &HashMap<String, String>, +) -> Result<Option<Box<dyn CredentialVendor>>> { + use azure::{AzureCredentialVendor, AzureCredentialVendorConfig}; + use lance_core::Error; + + // Azure requires account_name to be configured + let account_name = + properties + .get(azure_props::ACCOUNT_NAME) + .ok_or_else(|| { + Error::InvalidInput { + source: + "Azure credential vending requires 'credential_vendor.azure_account_name' to be set" + .into(), + location: snafu::location!(), + } + })?; + + let duration_millis = parse_duration_millis(properties, azure_props::DURATION_MILLIS); + let permission = parse_permission(properties); + + let mut config = AzureCredentialVendorConfig::new() + .with_account_name(account_name) + .with_duration_millis(duration_millis) + .with_permission(permission); + + if let Some(tenant_id) = properties.get(azure_props::TENANT_ID) { + config = config.with_tenant_id(tenant_id); + } + + let vendor = AzureCredentialVendor::new(config); + Ok(Some(Box::new(vendor))) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_detect_provider_from_uri() { + // AWS (supported scheme: s3://) + assert_eq!(detect_provider_from_uri("s3://bucket/path"), "aws"); + assert_eq!(detect_provider_from_uri("S3://bucket/path"), "aws"); + + // GCP (supported scheme: gs://) + assert_eq!(detect_provider_from_uri("gs://bucket/path"), "gcp"); + assert_eq!(detect_provider_from_uri("GS://bucket/path"), "gcp"); + + // Azure (supported scheme: az://) + assert_eq!(detect_provider_from_uri("az://container/path"), "azure"); + + // Unknown (unsupported schemes) + assert_eq!(detect_provider_from_uri("/local/path"), "unknown"); + assert_eq!(detect_provider_from_uri("file:///local/path"), "unknown"); + assert_eq!(detect_provider_from_uri("memory://test"), "unknown"); + // Hadoop-style schemes not supported by lance-io + assert_eq!(detect_provider_from_uri("s3a://bucket/path"), "unknown"); + assert_eq!( + detect_provider_from_uri("abfss://container@account.dfs.core.windows.net/path"), + "unknown" + ); + assert_eq!( + detect_provider_from_uri("wasbs://container@account.blob.core.windows.net/path"), + "unknown" + ); + } + + #[test] + fn test_vended_permission_from_str() { + // Valid values (case-insensitive) + assert_eq!( + "read".parse::<VendedPermission>().unwrap(), + VendedPermission::Read + ); + assert_eq!( + "READ".parse::<VendedPermission>().unwrap(), + VendedPermission::Read + ); + assert_eq!( + "write".parse::<VendedPermission>().unwrap(), + VendedPermission::Write + ); + assert_eq!( + "WRITE".parse::<VendedPermission>().unwrap(), + VendedPermission::Write + ); + assert_eq!( + "admin".parse::<VendedPermission>().unwrap(), + VendedPermission::Admin + ); + assert_eq!( + "Admin".parse::<VendedPermission>().unwrap(), + VendedPermission::Admin + ); + + // Invalid values should return an error + let err = "invalid".parse::<VendedPermission>().unwrap_err(); + assert!(err.contains("Invalid permission")); + assert!(err.contains("invalid")); + + let err = "".parse::<VendedPermission>().unwrap_err(); + assert!(err.contains("Invalid
permission")); + + let err = "readwrite".parse::().unwrap_err(); + assert!(err.contains("Invalid permission")); + } + + #[test] + fn test_vended_permission_display() { + assert_eq!(VendedPermission::Read.to_string(), "read"); + assert_eq!(VendedPermission::Write.to_string(), "write"); + assert_eq!(VendedPermission::Admin.to_string(), "admin"); + } + + #[test] + fn test_parse_permission_with_invalid_values() { + // Invalid permission should default to Read + let mut props = HashMap::new(); + props.insert(PERMISSION.to_string(), "invalid".to_string()); + assert_eq!(parse_permission(&props), VendedPermission::Read); + + // Empty permission should default to Read + props.insert(PERMISSION.to_string(), "".to_string()); + assert_eq!(parse_permission(&props), VendedPermission::Read); + + // Missing permission should default to Read + let empty_props: HashMap = HashMap::new(); + assert_eq!(parse_permission(&empty_props), VendedPermission::Read); + } + + #[test] + fn test_parse_duration_millis_with_invalid_values() { + const TEST_KEY: &str = "test_duration_millis"; + + // Invalid duration should default to DEFAULT_CREDENTIAL_DURATION_MILLIS + let mut props = HashMap::new(); + props.insert(TEST_KEY.to_string(), "not_a_number".to_string()); + assert_eq!( + parse_duration_millis(&props, TEST_KEY), + DEFAULT_CREDENTIAL_DURATION_MILLIS + ); + + // Negative number (parsed as u64 fails) + props.insert(TEST_KEY.to_string(), "-1000".to_string()); + assert_eq!( + parse_duration_millis(&props, TEST_KEY), + DEFAULT_CREDENTIAL_DURATION_MILLIS + ); + + // Empty string should default + props.insert(TEST_KEY.to_string(), "".to_string()); + assert_eq!( + parse_duration_millis(&props, TEST_KEY), + DEFAULT_CREDENTIAL_DURATION_MILLIS + ); + + // Missing duration should default + let empty_props: HashMap = HashMap::new(); + assert_eq!( + parse_duration_millis(&empty_props, TEST_KEY), + DEFAULT_CREDENTIAL_DURATION_MILLIS + ); + + // Valid duration should work + props.insert(TEST_KEY.to_string(), "7200000".to_string()); + assert_eq!(parse_duration_millis(&props, TEST_KEY), 7200000); + } + + #[test] + fn test_has_credential_vendor_config() { + // enabled = true + let mut props = HashMap::new(); + props.insert(ENABLED.to_string(), "true".to_string()); + assert!(has_credential_vendor_config(&props)); + + // enabled = TRUE (case-insensitive) + props.insert(ENABLED.to_string(), "TRUE".to_string()); + assert!(has_credential_vendor_config(&props)); + + // enabled = false + props.insert(ENABLED.to_string(), "false".to_string()); + assert!(!has_credential_vendor_config(&props)); + + // enabled = invalid value + props.insert(ENABLED.to_string(), "yes".to_string()); + assert!(!has_credential_vendor_config(&props)); + + // enabled missing + let empty_props: HashMap = HashMap::new(); + assert!(!has_credential_vendor_config(&empty_props)); + } + + #[test] + fn test_vended_credentials_debug_redacts_secrets() { + let mut storage_options = HashMap::new(); + storage_options.insert( + "aws_access_key_id".to_string(), + "AKIAIOSFODNN7EXAMPLE".to_string(), + ); + storage_options.insert( + "aws_secret_access_key".to_string(), + "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY".to_string(), + ); + storage_options.insert( + "aws_session_token".to_string(), + "FwoGZXIvYXdzE...".to_string(), + ); + + let creds = VendedCredentials::new(storage_options, 1234567890); + let debug_output = format!("{:?}", creds); + + // Should NOT contain actual secrets + assert!(!debug_output.contains("AKIAIOSFODNN7EXAMPLE")); + 
assert!(!debug_output.contains("wJalrXUtnFEMI")); + assert!(!debug_output.contains("FwoGZXIvYXdzE")); + + // Should contain redacted message + assert!(debug_output.contains("redacted")); + assert!(debug_output.contains("3 keys")); + + // Should contain expiration time + assert!(debug_output.contains("1234567890")); + } + + #[test] + fn test_vended_credentials_is_expired() { + // Create credentials that expired in the past + let past_millis = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis() as u64 + - 1000; // 1 second ago + + let expired_creds = VendedCredentials::new(HashMap::new(), past_millis); + assert!(expired_creds.is_expired()); + + // Create credentials that expire in the future + let future_millis = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis() as u64 + + 3600000; // 1 hour from now + + let valid_creds = VendedCredentials::new(HashMap::new(), future_millis); + assert!(!valid_creds.is_expired()); + } + + #[test] + fn test_redact_credential() { + // Long credential: shows first 8 and last 4 + assert_eq!(redact_credential("AKIAIOSFODNN7EXAMPLE"), "AKIAIOSF***MPLE"); + + // Exactly 16 chars: shows first 8 and last 4 + assert_eq!(redact_credential("1234567890123456"), "12345678***3456"); + + // Short credential (< 16 chars): shows only first few + assert_eq!(redact_credential("short1234567"), "short123***"); + assert_eq!(redact_credential("short123"), "short123***"); + assert_eq!(redact_credential("tiny"), "tiny***"); + assert_eq!(redact_credential("ab"), "ab***"); + assert_eq!(redact_credential("a"), "a***"); + + // Empty string + assert_eq!(redact_credential(""), "[empty]"); + + // Real-world examples + // AWS access key ID (20 chars) - shows AKIA + 4 more chars which helps identify the key + assert_eq!(redact_credential("AKIAIOSFODNN7EXAMPLE"), "AKIAIOSF***MPLE"); + + // GCP token (typically very long) + let long_token = "ya29.a0AfH6SMBx1234567890abcdefghijklmnopqrstuvwxyz"; + assert_eq!(redact_credential(long_token), "ya29.a0A***wxyz"); + + // Azure SAS token + let sas_token = "sv=2021-06-08&ss=b&srt=sco&sp=rwdlacuiytfx&se=2024-12-31"; + assert_eq!(redact_credential(sas_token), "sv=2021-***2-31"); + } +} diff --git a/rust/lance-namespace-impls/src/credentials/aws.rs b/rust/lance-namespace-impls/src/credentials/aws.rs new file mode 100644 index 00000000000..d9b363e37e0 --- /dev/null +++ b/rust/lance-namespace-impls/src/credentials/aws.rs @@ -0,0 +1,1152 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! AWS credential vending using STS AssumeRole. +//! +//! This module provides credential vending for AWS S3 storage by assuming +//! an IAM role using AWS STS (Security Token Service). + +use std::collections::HashMap; + +use async_trait::async_trait; +use aws_config::BehaviorVersion; +use aws_sdk_sts::Client as StsClient; +use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine}; +use lance_core::{Error, Result}; +use lance_io::object_store::uri_to_url; +use lance_namespace::models::Identity; +use log::{debug, info, warn}; +use sha2::{Digest, Sha256}; + +use super::{ + redact_credential, CredentialVendor, VendedCredentials, VendedPermission, + DEFAULT_CREDENTIAL_DURATION_MILLIS, +}; + +/// Configuration for AWS credential vending. +#[derive(Debug, Clone)] +pub struct AwsCredentialVendorConfig { + /// The IAM role ARN to assume. + /// Used for both AssumeRole (static/api_key) and AssumeRoleWithWebIdentity (auth_token). 
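+ /// Example (illustrative): `arn:aws:iam::123456789012:role/MyRole`.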
+ pub role_arn: String, + + /// Optional external ID for the assume role request. + pub external_id: Option<String>, + + /// Duration for vended credentials in milliseconds. + /// Default: 3600000 (1 hour). + /// AWS STS allows 900-43200 seconds (15 min - 12 hours). + /// Values outside this range will be clamped. + pub duration_millis: u64, + + /// Optional role session name. Defaults to "lance-credential-vending". + pub role_session_name: Option<String>, + + /// Optional AWS region for the STS client. + pub region: Option<String>, + + /// Permission level for vended credentials. + /// Default: Read (full read access) + /// Used to generate a scoped IAM policy for all credential flows. + pub permission: VendedPermission, + + /// Salt for API key hashing. + /// Required when using API key authentication. + /// API keys are hashed as: SHA256(api_key + ":" + salt) + pub api_key_salt: Option<String>, + + /// Map of SHA256(api_key + ":" + salt) -> permission level. + /// When an API key is provided, its hash is looked up in this map. + /// If found, the mapped permission is used instead of the default permission. + pub api_key_hash_permissions: HashMap<String, VendedPermission>, +} + +impl AwsCredentialVendorConfig { + /// Create a new config with the specified role ARN. + pub fn new(role_arn: impl Into<String>) -> Self { + Self { + role_arn: role_arn.into(), + external_id: None, + duration_millis: DEFAULT_CREDENTIAL_DURATION_MILLIS, + role_session_name: None, + region: None, + permission: VendedPermission::default(), + api_key_salt: None, + api_key_hash_permissions: HashMap::new(), + } + } + + /// Set the external ID for the assume role request. + pub fn with_external_id(mut self, external_id: impl Into<String>) -> Self { + self.external_id = Some(external_id.into()); + self + } + + /// Set the credential duration in milliseconds. + pub fn with_duration_millis(mut self, millis: u64) -> Self { + self.duration_millis = millis; + self + } + + /// Set the role session name. + pub fn with_role_session_name(mut self, name: impl Into<String>) -> Self { + self.role_session_name = Some(name.into()); + self + } + + /// Set the AWS region for the STS client. + pub fn with_region(mut self, region: impl Into<String>) -> Self { + self.region = Some(region.into()); + self + } + + /// Set the permission level for vended credentials. + pub fn with_permission(mut self, permission: VendedPermission) -> Self { + self.permission = permission; + self + } + + /// Set the API key salt for hashing. + pub fn with_api_key_salt(mut self, salt: impl Into<String>) -> Self { + self.api_key_salt = Some(salt.into()); + self + } + + /// Add an API key hash to permission mapping. + pub fn with_api_key_hash_permission( + mut self, + key_hash: impl Into<String>, + permission: VendedPermission, + ) -> Self { + self.api_key_hash_permissions + .insert(key_hash.into(), permission); + self + } + + /// Set the entire API key hash permissions map. + pub fn with_api_key_hash_permissions( + mut self, + permissions: HashMap<String, VendedPermission>, + ) -> Self { + self.api_key_hash_permissions = permissions; + self + } +} + +/// AWS credential vendor that uses STS AssumeRole. +#[derive(Debug)] +pub struct AwsCredentialVendor { + config: AwsCredentialVendorConfig, + sts_client: StsClient, +} + +impl AwsCredentialVendor { + /// Create a new AWS credential vendor with the specified configuration.
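+ /// + /// A construction sketch (role ARN and region are illustrative): + /// + /// ```ignore + /// let config = AwsCredentialVendorConfig::new("arn:aws:iam::123456789012:role/MyRole") + ///     .with_region("us-west-2") + ///     .with_permission(VendedPermission::Read); + /// let vendor = AwsCredentialVendor::new(config).await?; + /// ```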
+ pub async fn new(config: AwsCredentialVendorConfig) -> Result<Self> { + let mut aws_config_loader = aws_config::defaults(BehaviorVersion::latest()); + + if let Some(ref region) = config.region { + aws_config_loader = aws_config_loader.region(aws_config::Region::new(region.clone())); + } + + let aws_config = aws_config_loader.load().await; + let sts_client = StsClient::new(&aws_config); + + Ok(Self { config, sts_client }) + } + + /// Create a new AWS credential vendor with an existing STS client. + pub fn with_sts_client(config: AwsCredentialVendorConfig, sts_client: StsClient) -> Self { + Self { config, sts_client } + } + + /// Parse an S3 URI to extract bucket and prefix. + fn parse_s3_uri(uri: &str) -> Result<(String, String)> { + let url = uri_to_url(uri)?; + + let bucket = url + .host_str() + .ok_or_else(|| Error::InvalidInput { + source: format!("S3 URI '{}' missing bucket", uri).into(), + location: snafu::location!(), + })? + .to_string(); + + let prefix = url.path().trim_start_matches('/').to_string(); + + Ok((bucket, prefix)) + } + + /// Build a scoped IAM policy for the specified location and permission level. + /// + /// Permission levels: + /// - `Read`: Full read access to all content (metadata, indices, data files) + /// - `Write`: Full read and write access (no delete) + /// - `Admin`: Full read, write, and delete access + fn build_policy(bucket: &str, prefix: &str, permission: VendedPermission) -> String { + let prefix_trimmed = prefix.trim_end_matches('/'); + let base_path = if prefix.is_empty() { + format!("arn:aws:s3:::{}/*", bucket) + } else { + format!("arn:aws:s3:::{}/{}/*", bucket, prefix_trimmed) + }; + let bucket_arn = format!("arn:aws:s3:::{}", bucket); + + let mut statements = vec![]; + + // List bucket permission (always needed) + statements.push(serde_json::json!({ + "Effect": "Allow", + "Action": "s3:ListBucket", + "Resource": bucket_arn, + "Condition": { + "StringLike": { + "s3:prefix": if prefix.is_empty() { + "*".to_string() + } else { + format!("{}/*", prefix_trimmed) + } + } + } + })); + + // Get bucket location (always needed) + statements.push(serde_json::json!({ + "Effect": "Allow", + "Action": "s3:GetBucketLocation", + "Resource": bucket_arn + })); + + // Read access (all permission levels have full read) + statements.push(serde_json::json!({ + "Effect": "Allow", + "Action": ["s3:GetObject", "s3:GetObjectVersion"], + "Resource": base_path + })); + + // Write access (Write and Admin) + if permission.can_write() { + statements.push(serde_json::json!({ + "Effect": "Allow", + "Action": "s3:PutObject", + "Resource": base_path + })); + } + + // Delete access (Admin only) + if permission.can_delete() { + statements.push(serde_json::json!({ + "Effect": "Allow", + "Action": "s3:DeleteObject", + "Resource": base_path + })); + } + + let policy = serde_json::json!({ + "Version": "2012-10-17", + "Statement": statements + }); + + policy.to_string() + } + + /// Hash an API key using SHA-256 with salt (Polaris pattern). + /// Format: SHA256(api_key + ":" + salt) as a hex string. + pub fn hash_api_key(api_key: &str, salt: &str) -> String { + let mut hasher = Sha256::new(); + hasher.update(format!("{}:{}", api_key, salt)); + format!("{:x}", hasher.finalize()) + } + + /// Extract a session name from a JWT token (best effort, no validation). + /// Decodes the payload and extracts the 'sub' or 'email' claim. + /// Falls back to "lance-web-identity" if parsing fails.
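+ /// For example, a token whose payload decodes to `{"sub": "user@example.com"}` + /// yields the session name `lance-user@example.com`.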
+ fn derive_session_name_from_token(token: &str) -> String { + // JWT format: header.payload.signature + let parts: Vec<&str> = token.split('.').collect(); + if parts.len() != 3 { + return "lance-web-identity".to_string(); + } + + // Decode the payload (second part) + let payload = match URL_SAFE_NO_PAD.decode(parts[1]) { + Ok(bytes) => bytes, + Err(_) => { + // Try standard base64 as a fallback + match base64::engine::general_purpose::STANDARD_NO_PAD.decode(parts[1]) { + Ok(bytes) => bytes, + Err(_) => return "lance-web-identity".to_string(), + } + } + }; + + // Parse as JSON and extract 'sub' or 'email' + let json: serde_json::Value = match serde_json::from_slice(&payload) { + Ok(v) => v, + Err(_) => return "lance-web-identity".to_string(), + }; + + let subject = json + .get("sub") + .or_else(|| json.get("email")) + .and_then(|v| v.as_str()) + .unwrap_or("unknown"); + + // Sanitize for role session name (alphanumeric, =, @, -, .) + let sanitized: String = subject + .chars() + .filter(|c| c.is_alphanumeric() || *c == '=' || *c == '@' || *c == '-' || *c == '.') + .collect(); + + let session_name = format!("lance-{}", sanitized); + + // Cap to 64 chars (AWS limit) + if session_name.len() > 64 { + session_name[..64].to_string() + } else { + session_name + } + } + + /// Cap a session name to 64 characters (AWS limit). + fn cap_session_name(name: &str) -> String { + if name.len() > 64 { + name[..64].to_string() + } else { + name.to_string() + } + } + + /// Extract credentials from an STS Credentials response. + fn extract_credentials( + &self, + credentials: Option<&aws_sdk_sts::types::Credentials>, + bucket: &str, + prefix: &str, + permission: VendedPermission, + ) -> Result<VendedCredentials> { + let credentials = credentials.ok_or_else(|| Error::IO { + source: Box::new(std::io::Error::other("STS response missing credentials")), + location: snafu::location!(), + })?; + + let access_key_id = credentials.access_key_id().to_string(); + let secret_access_key = credentials.secret_access_key().to_string(); + let session_token = credentials.session_token().to_string(); + + let expiration = credentials.expiration(); + let expires_at_millis = + (expiration.secs() as u64) * 1000 + (expiration.subsec_nanos() / 1_000_000) as u64; + + info!( + "AWS credentials vended: bucket={}, prefix={}, permission={}, expires_at={}, access_key_id={}", + bucket, prefix, permission, expires_at_millis, redact_credential(&access_key_id) + ); + + let mut storage_options = HashMap::new(); + storage_options.insert("aws_access_key_id".to_string(), access_key_id); + storage_options.insert("aws_secret_access_key".to_string(), secret_access_key); + storage_options.insert("aws_session_token".to_string(), session_token); + storage_options.insert( + "expires_at_millis".to_string(), + expires_at_millis.to_string(), + ); + + // Include region if configured + if let Some(ref region) = self.config.region { + storage_options.insert("aws_region".to_string(), region.clone()); + } + + Ok(VendedCredentials::new(storage_options, expires_at_millis)) + } + + /// Vend credentials using AssumeRoleWithWebIdentity (for auth_token).
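+ /// + /// The session name is derived from the token's `sub`/`email` claim and the + /// scoped policy passed in is attached to the AssumeRoleWithWebIdentity request.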
+    async fn vend_with_web_identity(
+        &self,
+        bucket: &str,
+        prefix: &str,
+        auth_token: &str,
+        policy: &str,
+    ) -> Result<VendedCredentials> {
+        let session_name = Self::derive_session_name_from_token(auth_token);
+        let duration_secs = self.config.duration_millis.div_ceil(1000).clamp(900, 43200) as i32;
+
+        debug!(
+            "AWS AssumeRoleWithWebIdentity: role={}, session={}, permission={}",
+            self.config.role_arn, session_name, self.config.permission
+        );
+
+        let response = self
+            .sts_client
+            .assume_role_with_web_identity()
+            .role_arn(&self.config.role_arn)
+            .web_identity_token(auth_token)
+            .role_session_name(&session_name)
+            .policy(policy)
+            .duration_seconds(duration_secs)
+            .send()
+            .await
+            .map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "AssumeRoleWithWebIdentity failed for role '{}': {}",
+                    self.config.role_arn, e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        self.extract_credentials(
+            response.credentials(),
+            bucket,
+            prefix,
+            self.config.permission,
+        )
+    }
+
+    /// Vend credentials using AssumeRole with API key validation.
+    async fn vend_with_api_key(
+        &self,
+        bucket: &str,
+        prefix: &str,
+        api_key: &str,
+    ) -> Result<VendedCredentials> {
+        let salt = self
+            .config
+            .api_key_salt
+            .as_ref()
+            .ok_or_else(|| Error::InvalidInput {
+                source: "api_key_salt must be configured to use API key authentication".into(),
+                location: snafu::location!(),
+            })?;
+
+        let key_hash = Self::hash_api_key(api_key, salt);
+
+        // Look up permission from hash mapping
+        let permission = self
+            .config
+            .api_key_hash_permissions
+            .get(&key_hash)
+            .copied()
+            .ok_or_else(|| {
+                warn!(
+                    "Invalid API key: hash {} not found in permissions map",
+                    &key_hash[..8]
+                );
+                Error::InvalidInput {
+                    source: "Invalid API key".into(),
+                    location: snafu::location!(),
+                }
+            })?;
+
+        let policy = Self::build_policy(bucket, prefix, permission);
+        let session_name = Self::cap_session_name(&format!("lance-api-{}", &key_hash[..16]));
+        let duration_secs = self.config.duration_millis.div_ceil(1000).clamp(900, 43200) as i32;
+
+        debug!(
+            "AWS AssumeRole with API key: role={}, session={}, permission={}",
+            self.config.role_arn, session_name, permission
+        );
+
+        let request = self
+            .sts_client
+            .assume_role()
+            .role_arn(&self.config.role_arn)
+            .role_session_name(&session_name)
+            .policy(&policy)
+            .duration_seconds(duration_secs)
+            .external_id(&key_hash); // Use hash as external_id
+
+        let response = request.send().await.map_err(|e| Error::IO {
+            source: Box::new(std::io::Error::other(format!(
+                "AssumeRole with API key failed for role '{}': {}",
+                self.config.role_arn, e
+            ))),
+            location: snafu::location!(),
+        })?;
+
+        self.extract_credentials(response.credentials(), bucket, prefix, permission)
+    }
+
+    /// Vend credentials using AssumeRole with static configuration.
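+    ///
+    /// Reached via `vend_credentials` when no identity is supplied; the role is
+    /// assumed with the statically configured session name and optional external
+    /// ID. A minimal sketch of that entry point:
+    ///
+    /// ```ignore
+    /// let creds = vendor.vend_credentials("s3://bucket/path/table", None).await?;
+    /// ```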
+    async fn vend_with_static_config(
+        &self,
+        bucket: &str,
+        prefix: &str,
+        policy: &str,
+    ) -> Result<VendedCredentials> {
+        let role_session_name = self
+            .config
+            .role_session_name
+            .clone()
+            .unwrap_or_else(|| "lance-credential-vending".to_string());
+        let role_session_name = Self::cap_session_name(&role_session_name);
+
+        let duration_secs = self.config.duration_millis.div_ceil(1000).clamp(900, 43200) as i32;
+
+        debug!(
+            "AWS AssumeRole (static): role={}, session={}, permission={}",
+            self.config.role_arn, role_session_name, self.config.permission
+        );
+
+        let mut request = self
+            .sts_client
+            .assume_role()
+            .role_arn(&self.config.role_arn)
+            .role_session_name(&role_session_name)
+            .policy(policy)
+            .duration_seconds(duration_secs);
+
+        if let Some(ref external_id) = self.config.external_id {
+            request = request.external_id(external_id);
+        }
+
+        let response = request.send().await.map_err(|e| Error::IO {
+            source: Box::new(std::io::Error::other(format!(
+                "AssumeRole failed for role '{}': {}",
+                self.config.role_arn, e
+            ))),
+            location: snafu::location!(),
+        })?;
+
+        self.extract_credentials(
+            response.credentials(),
+            bucket,
+            prefix,
+            self.config.permission,
+        )
+    }
+}
+
+#[async_trait]
+impl CredentialVendor for AwsCredentialVendor {
+    async fn vend_credentials(
+        &self,
+        table_location: &str,
+        identity: Option<&Identity>,
+    ) -> Result<VendedCredentials> {
+        debug!(
+            "AWS credential vending: location={}, permission={}, has_identity={}",
+            table_location,
+            self.config.permission,
+            identity.is_some()
+        );
+
+        let (bucket, prefix) = Self::parse_s3_uri(table_location)?;
+
+        match identity {
+            Some(id) if id.auth_token.is_some() => {
+                // Use AssumeRoleWithWebIdentity with configured permission
+                let policy = Self::build_policy(&bucket, &prefix, self.config.permission);
+                self.vend_with_web_identity(
+                    &bucket,
+                    &prefix,
+                    id.auth_token.as_ref().unwrap(),
+                    &policy,
+                )
+                .await
+            }
+            Some(id) if id.api_key.is_some() => {
+                // Use AssumeRole with API key validation and mapped permission
+                self.vend_with_api_key(&bucket, &prefix, id.api_key.as_ref().unwrap())
+                    .await
+            }
+            Some(_) => {
+                // Identity provided but neither api_key nor auth_token set
+                Err(Error::InvalidInput {
+                    source: "Identity provided but neither api_key nor auth_token is set".into(),
+                    location: snafu::location!(),
+                })
+            }
+            None => {
+                // Use AssumeRole with static configuration
+                let policy = Self::build_policy(&bucket, &prefix, self.config.permission);
+                self.vend_with_static_config(&bucket, &prefix, &policy)
+                    .await
+            }
+        }
+    }
+
+    fn provider_name(&self) -> &'static str {
+        "aws"
+    }
+
+    fn permission(&self) -> VendedPermission {
+        self.config.permission
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_s3_uri() {
+        let (bucket, prefix) = AwsCredentialVendor::parse_s3_uri("s3://my-bucket/path/to/table")
+            .expect("should parse");
+        assert_eq!(bucket, "my-bucket");
+        assert_eq!(prefix, "path/to/table");
+
+        let (bucket, prefix) =
+            AwsCredentialVendor::parse_s3_uri("s3://my-bucket/").expect("should parse");
+        assert_eq!(bucket, "my-bucket");
+        assert_eq!(prefix, "");
+
+        let (bucket, prefix) =
+            AwsCredentialVendor::parse_s3_uri("s3://my-bucket").expect("should parse");
+        assert_eq!(bucket, "my-bucket");
+        assert_eq!(prefix, "");
+    }
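+
+    /// Shape and determinism checks for the salted API-key hash. A sketch that
+    /// deliberately avoids pinning a fixed digest value.
+    #[test]
+    fn test_hash_api_key_properties() {
+        let h1 = AwsCredentialVendor::hash_api_key("my-key", "salt-a");
+        let h2 = AwsCredentialVendor::hash_api_key("my-key", "salt-a");
+        let h3 = AwsCredentialVendor::hash_api_key("my-key", "salt-b");
+        assert_eq!(h1, h2, "hashing is deterministic");
+        assert_ne!(h1, h3, "different salts must produce different hashes");
+        assert_eq!(h1.len(), 64, "SHA-256 hex digest is 64 chars");
+        assert!(h1.chars().all(|c| c.is_ascii_hexdigit()));
+    }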
parsed["Statement"].as_array().expect("statements array"); + assert_eq!(statements.len(), 3); // ListBucket, GetBucketLocation, GetObject + + // Verify no write actions + for stmt in statements { + let actions = stmt["Action"].clone(); + let action_list: Vec = if actions.is_array() { + actions + .as_array() + .unwrap() + .iter() + .map(|a| a.as_str().unwrap().to_string()) + .collect() + } else { + vec![actions.as_str().unwrap().to_string()] + }; + assert!(!action_list.contains(&"s3:PutObject".to_string())); + assert!(!action_list.contains(&"s3:DeleteObject".to_string())); + } + } + + #[test] + fn test_build_policy_write() { + let policy = AwsCredentialVendor::build_policy( + "my-bucket", + "path/to/table", + VendedPermission::Write, + ); + let parsed: serde_json::Value = serde_json::from_str(&policy).expect("valid json"); + + let statements = parsed["Statement"].as_array().expect("statements array"); + // ListBucket, GetBucketLocation, GetObject, PutObject + assert_eq!(statements.len(), 4); + + // Verify PutObject is present + let write_stmt = statements + .iter() + .find(|s| { + let action = &s["Action"]; + action.as_str() == Some("s3:PutObject") + }) + .expect("should have PutObject statement"); + assert!(write_stmt["Effect"].as_str() == Some("Allow")); + + // Verify DeleteObject is NOT present (Write doesn't have delete) + let delete_stmt = statements.iter().find(|s| { + let action = &s["Action"]; + action.as_str() == Some("s3:DeleteObject") + }); + assert!(delete_stmt.is_none(), "Write should not have DeleteObject"); + + // Verify no Deny statements + let deny_stmt = statements + .iter() + .find(|s| s["Effect"].as_str() == Some("Deny")); + assert!(deny_stmt.is_none(), "Write should not have Deny statements"); + } + + #[test] + fn test_build_policy_admin() { + let policy = AwsCredentialVendor::build_policy( + "my-bucket", + "path/to/table", + VendedPermission::Admin, + ); + let parsed: serde_json::Value = serde_json::from_str(&policy).expect("valid json"); + + let statements = parsed["Statement"].as_array().expect("statements array"); + // ListBucket, GetBucketLocation, GetObject, PutObject, DeleteObject + assert_eq!(statements.len(), 5); + + // Verify read actions + let read_stmt = statements + .iter() + .find(|s| { + let actions = s["Action"].clone(); + if actions.is_array() { + actions + .as_array() + .unwrap() + .iter() + .any(|a| a.as_str().unwrap() == "s3:GetObject") + } else { + false + } + }) + .expect("should have read statement"); + assert!(read_stmt["Effect"].as_str() == Some("Allow")); + + // Verify PutObject + let write_stmt = statements + .iter() + .find(|s| s["Action"].as_str() == Some("s3:PutObject")) + .expect("should have PutObject statement"); + assert!(write_stmt["Effect"].as_str() == Some("Allow")); + + // Verify DeleteObject (Admin only) + let delete_stmt = statements + .iter() + .find(|s| s["Action"].as_str() == Some("s3:DeleteObject")) + .expect("should have DeleteObject statement"); + assert!(delete_stmt["Effect"].as_str() == Some("Allow")); + + // Verify no Deny statements + let deny_stmt = statements + .iter() + .find(|s| s["Effect"].as_str() == Some("Deny")); + assert!(deny_stmt.is_none(), "Admin should not have Deny statements"); + } + + #[test] + fn test_config_builder() { + let config = AwsCredentialVendorConfig::new("arn:aws:iam::123456789012:role/MyRole") + .with_external_id("my-external-id") + .with_duration_millis(7200000) + .with_role_session_name("my-session") + .with_region("us-west-2"); + + assert_eq!(config.role_arn, 
"arn:aws:iam::123456789012:role/MyRole"); + assert_eq!(config.external_id, Some("my-external-id".to_string())); + assert_eq!(config.duration_millis, 7200000); + assert_eq!(config.role_session_name, Some("my-session".to_string())); + assert_eq!(config.region, Some("us-west-2".to_string())); + } + + // ============================================================================ + // Integration Tests + // ============================================================================ + + /// Integration tests for AWS credential vending. + /// + /// These tests require: + /// - Valid AWS credentials (via environment, IAM role, or credential file) + /// - The `LANCE_TEST_AWS_ROLE_ARN` environment variable set to a role ARN that + /// can be assumed by the current credentials + /// - Access to the S3 bucket `jack-lancedb-devland-us-east-1` + /// + /// Run with: `cargo test --features credential-vendor-aws -- --ignored` + #[cfg(test)] + mod integration { + use super::*; + use crate::DirectoryNamespaceBuilder; + use arrow::array::{Int32Array, StringArray}; + use arrow::datatypes::{DataType, Field, Schema}; + use arrow::ipc::writer::StreamWriter; + use arrow::record_batch::RecordBatch; + use bytes::Bytes; + use lance_namespace::models::*; + use lance_namespace::LanceNamespace; + use std::sync::Arc; + + const TEST_BUCKET: &str = "jack-lancedb-devland-us-east-1"; + + /// Helper to create Arrow IPC data for testing + fn create_test_arrow_data() -> Bytes { + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + ]); + + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec!["alice", "bob", "charlie"])), + ], + ) + .unwrap(); + + let mut buffer = Vec::new(); + { + let mut writer = StreamWriter::try_new(&mut buffer, &batch.schema()).unwrap(); + writer.write(&batch).unwrap(); + writer.finish().unwrap(); + } + + Bytes::from(buffer) + } + + /// Generate a unique test path for each test run to avoid conflicts + fn unique_test_path() -> String { + let timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis(); + format!("lance-test/credential-vending-{}", timestamp) + } + + /// Get the role ARN from environment variable + fn get_test_role_arn() -> Option { + std::env::var("LANCE_TEST_AWS_ROLE_ARN").ok() + } + + #[tokio::test] + #[ignore = "requires AWS credentials and LANCE_TEST_AWS_ROLE_ARN env var"] + async fn test_aws_credential_vending_basic() { + let role_arn = get_test_role_arn() + .expect("LANCE_TEST_AWS_ROLE_ARN must be set for integration tests"); + + let test_path = unique_test_path(); + let table_location = format!("s3://{}/{}/test_table", TEST_BUCKET, test_path); + + // Test Read permission + let read_config = AwsCredentialVendorConfig::new(&role_arn) + .with_duration_millis(900_000) // 15 minutes (minimum) + .with_region("us-east-1") + .with_permission(VendedPermission::Read); + + let read_vendor = AwsCredentialVendor::new(read_config) + .await + .expect("should create read vendor"); + + let read_creds = read_vendor + .vend_credentials(&table_location, None) + .await + .expect("should vend read credentials"); + + assert!( + read_creds.storage_options.contains_key("aws_access_key_id"), + "should have access key id" + ); + assert!( + read_creds + .storage_options + .contains_key("aws_secret_access_key"), + "should have secret access key" + ); + assert!( + 
read_creds.storage_options.contains_key("aws_session_token"), + "should have session token" + ); + assert!( + !read_creds.is_expired(), + "credentials should not be expired" + ); + assert_eq!( + read_vendor.permission(), + VendedPermission::Read, + "permission should be Read" + ); + + // Test Admin permission + let admin_config = AwsCredentialVendorConfig::new(&role_arn) + .with_duration_millis(900_000) + .with_region("us-east-1") + .with_permission(VendedPermission::Admin); + + let admin_vendor = AwsCredentialVendor::new(admin_config) + .await + .expect("should create admin vendor"); + + let admin_creds = admin_vendor + .vend_credentials(&table_location, None) + .await + .expect("should vend admin credentials"); + + assert!( + admin_creds + .storage_options + .contains_key("aws_access_key_id"), + "should have access key id" + ); + assert!( + !admin_creds.is_expired(), + "credentials should not be expired" + ); + assert_eq!( + admin_vendor.permission(), + VendedPermission::Admin, + "permission should be Admin" + ); + } + + #[tokio::test] + #[ignore = "requires AWS credentials and LANCE_TEST_AWS_ROLE_ARN env var"] + async fn test_directory_namespace_with_aws_credential_vending() { + let role_arn = get_test_role_arn() + .expect("LANCE_TEST_AWS_ROLE_ARN must be set for integration tests"); + + let test_path = unique_test_path(); + let root = format!("s3://{}/{}", TEST_BUCKET, test_path); + + // Build DirectoryNamespace with credential vending using short property names + let namespace = DirectoryNamespaceBuilder::new(&root) + .manifest_enabled(true) + .credential_vendor_property("enabled", "true") + .credential_vendor_property("aws_role_arn", &role_arn) + .credential_vendor_property("aws_duration_millis", "900000") // 15 minutes + .credential_vendor_property("aws_region", "us-east-1") + .credential_vendor_property("permission", "admin") + .build() + .await + .expect("should build namespace"); + + // Create a child namespace + let create_ns_req = CreateNamespaceRequest { + id: Some(vec!["test_ns".to_string()]), + ..Default::default() + }; + namespace + .create_namespace(create_ns_req) + .await + .expect("should create namespace"); + + // Create a table with data + let table_data = create_test_arrow_data(); + let create_table_req = CreateTableRequest { + id: Some(vec!["test_ns".to_string(), "test_table".to_string()]), + mode: Some("Create".to_string()), + ..Default::default() + }; + let create_response = namespace + .create_table(create_table_req, table_data) + .await + .expect("should create table"); + + assert!( + create_response.location.is_some(), + "should have location in response" + ); + assert_eq!(create_response.version, Some(1), "should be version 1"); + + // Describe the table (this should use vended credentials) + let describe_req = DescribeTableRequest { + id: Some(vec!["test_ns".to_string(), "test_table".to_string()]), + ..Default::default() + }; + let describe_response = namespace + .describe_table(describe_req) + .await + .expect("should describe table"); + + assert!(describe_response.location.is_some(), "should have location"); + assert!( + describe_response.storage_options.is_some(), + "should have storage_options with vended credentials" + ); + + let storage_options = describe_response.storage_options.unwrap(); + assert!( + storage_options.contains_key("aws_access_key_id"), + "should have vended aws_access_key_id" + ); + assert!( + storage_options.contains_key("aws_secret_access_key"), + "should have vended aws_secret_access_key" + ); + assert!( + 
storage_options.contains_key("aws_session_token"), + "should have vended aws_session_token" + ); + assert!( + storage_options.contains_key("expires_at_millis"), + "should have expires_at_millis" + ); + + // Verify expiration is in the future + let expires_at: u64 = storage_options + .get("expires_at_millis") + .unwrap() + .parse() + .expect("should parse expires_at_millis"); + let now_millis = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis() as u64; + assert!( + expires_at > now_millis, + "expiration should be in the future" + ); + + // List tables to verify the table was created + let list_req = ListTablesRequest { + id: Some(vec!["test_ns".to_string()]), + ..Default::default() + }; + let list_response = namespace + .list_tables(list_req) + .await + .expect("should list tables"); + assert!( + list_response.tables.contains(&"test_table".to_string()), + "should contain test_table" + ); + + // Clean up: drop the table + let drop_req = DropTableRequest { + id: Some(vec!["test_ns".to_string(), "test_table".to_string()]), + ..Default::default() + }; + namespace + .drop_table(drop_req) + .await + .expect("should drop table"); + + // Clean up: drop the namespace + let mut drop_ns_req = DropNamespaceRequest::new(); + drop_ns_req.id = Some(vec!["test_ns".to_string()]); + namespace + .drop_namespace(drop_ns_req) + .await + .expect("should drop namespace"); + } + + #[tokio::test] + #[ignore = "requires AWS credentials and LANCE_TEST_AWS_ROLE_ARN env var"] + async fn test_credential_refresh_on_expiration() { + let role_arn = get_test_role_arn() + .expect("LANCE_TEST_AWS_ROLE_ARN must be set for integration tests"); + + let test_path = unique_test_path(); + let table_location = format!("s3://{}/{}/refresh_test", TEST_BUCKET, test_path); + + // Create vendor with minimum duration and Admin permission + let config = AwsCredentialVendorConfig::new(&role_arn) + .with_duration_millis(900_000) // 15 minutes + .with_region("us-east-1") + .with_permission(VendedPermission::Admin); + + let vendor = AwsCredentialVendor::new(config) + .await + .expect("should create vendor"); + + // Vend credentials multiple times to verify consistent behavior + let creds1 = vendor + .vend_credentials(&table_location, None) + .await + .expect("should vend credentials first time"); + + let creds2 = vendor + .vend_credentials(&table_location, None) + .await + .expect("should vend credentials second time"); + + // Both should be valid (not expired) + assert!(!creds1.is_expired(), "first credentials should be valid"); + assert!(!creds2.is_expired(), "second credentials should be valid"); + + // Both should have access keys (they may be different due to new STS calls) + assert!( + creds1.storage_options.contains_key("aws_access_key_id"), + "first creds should have access key" + ); + assert!( + creds2.storage_options.contains_key("aws_access_key_id"), + "second creds should have access key" + ); + } + + #[tokio::test] + #[ignore = "requires AWS credentials and LANCE_TEST_AWS_ROLE_ARN env var"] + async fn test_scoped_policy_permissions() { + let role_arn = get_test_role_arn() + .expect("LANCE_TEST_AWS_ROLE_ARN must be set for integration tests"); + + let test_path = unique_test_path(); + + // Create two different table locations + let table1_location = format!("s3://{}/{}/table1", TEST_BUCKET, test_path); + let table2_location = format!("s3://{}/{}/table2", TEST_BUCKET, test_path); + + let config = AwsCredentialVendorConfig::new(&role_arn) + .with_duration_millis(900_000) + 
.with_region("us-east-1") + .with_permission(VendedPermission::Admin); + + let vendor = AwsCredentialVendor::new(config) + .await + .expect("should create vendor"); + + // Vend credentials for table1 + let creds1 = vendor + .vend_credentials(&table1_location, None) + .await + .expect("should vend credentials for table1"); + + // Vend credentials for table2 + let creds2 = vendor + .vend_credentials(&table2_location, None) + .await + .expect("should vend credentials for table2"); + + // Both should be valid + assert!(!creds1.is_expired(), "table1 credentials should be valid"); + assert!(!creds2.is_expired(), "table2 credentials should be valid"); + + // The credentials are scoped to their respective paths via IAM policy + // (the policy restricts access to specific S3 paths) + } + + #[tokio::test] + #[ignore = "requires AWS credentials and LANCE_TEST_AWS_ROLE_ARN env var"] + async fn test_from_properties_builder() { + let role_arn = get_test_role_arn() + .expect("LANCE_TEST_AWS_ROLE_ARN must be set for integration tests"); + + let test_path = unique_test_path(); + let root = format!("s3://{}/{}", TEST_BUCKET, test_path); + + // Build namespace using from_properties (simulating config from external source) + // Properties use the "credential_vendor." prefix which gets stripped + let mut properties = HashMap::new(); + properties.insert("root".to_string(), root.clone()); + properties.insert("manifest_enabled".to_string(), "true".to_string()); + properties.insert("credential_vendor.enabled".to_string(), "true".to_string()); + properties.insert( + "credential_vendor.aws_role_arn".to_string(), + role_arn.clone(), + ); + properties.insert( + "credential_vendor.aws_duration_millis".to_string(), + "900000".to_string(), + ); + properties.insert( + "credential_vendor.aws_region".to_string(), + "us-east-1".to_string(), + ); + properties.insert( + "credential_vendor.permission".to_string(), + "admin".to_string(), + ); + + let namespace = DirectoryNamespaceBuilder::from_properties(properties, None) + .expect("should parse properties") + .build() + .await + .expect("should build namespace"); + + // Verify namespace works + let create_ns_req = CreateNamespaceRequest { + id: Some(vec!["props_test".to_string()]), + ..Default::default() + }; + namespace + .create_namespace(create_ns_req) + .await + .expect("should create namespace"); + + // Clean up + let mut drop_ns_req = DropNamespaceRequest::new(); + drop_ns_req.id = Some(vec!["props_test".to_string()]); + namespace + .drop_namespace(drop_ns_req) + .await + .expect("should drop namespace"); + } + } +} diff --git a/rust/lance-namespace-impls/src/credentials/azure.rs b/rust/lance-namespace-impls/src/credentials/azure.rs new file mode 100644 index 00000000000..75a711b7448 --- /dev/null +++ b/rust/lance-namespace-impls/src/credentials/azure.rs @@ -0,0 +1,979 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Azure credential vending using SAS tokens. +//! +//! This module provides credential vending for Azure Blob Storage by generating +//! SAS (Shared Access Signature) tokens with user delegation keys. 
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use azure_core::auth::TokenCredential;
+use azure_identity::DefaultAzureCredential;
+use azure_storage::prelude::*;
+use azure_storage::shared_access_signature::service_sas::{BlobSharedAccessSignature, SasKey};
+use azure_storage_blobs::prelude::*;
+use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine};
+use lance_core::{Error, Result};
+use lance_io::object_store::uri_to_url;
+use lance_namespace::models::Identity;
+use log::{debug, info, warn};
+use sha2::{Digest, Sha256};
+
+use super::{
+    redact_credential, CredentialVendor, VendedCredentials, VendedPermission,
+    DEFAULT_CREDENTIAL_DURATION_MILLIS,
+};
+
+/// Configuration for Azure credential vending.
+#[derive(Debug, Clone)]
+pub struct AzureCredentialVendorConfig {
+    /// Optional tenant ID for authentication.
+    pub tenant_id: Option<String>,
+
+    /// Storage account name. Required for credential vending.
+    pub account_name: Option<String>,
+
+    /// Duration for vended credentials in milliseconds.
+    /// Default: 3600000 (1 hour). Azure allows up to 7 days for SAS tokens.
+    pub duration_millis: u64,
+
+    /// Permission level for vended credentials.
+    /// Default: Read (full read access).
+    /// Used to generate SAS permissions for all credential flows.
+    pub permission: VendedPermission,
+
+    /// Client ID of the Azure AD App Registration for Workload Identity Federation.
+    /// Required when using auth_token identity for OIDC token exchange.
+    pub federated_client_id: Option<String>,
+
+    /// Salt for API key hashing.
+    /// Required when using API key authentication.
+    /// API keys are hashed as: SHA256(api_key + ":" + salt)
+    pub api_key_salt: Option<String>,
+
+    /// Map of SHA256(api_key + ":" + salt) -> permission level.
+    /// When an API key is provided, its hash is looked up in this map.
+    /// If found, the mapped permission is used instead of the default permission.
+    pub api_key_hash_permissions: HashMap<String, VendedPermission>,
+}
+
+impl Default for AzureCredentialVendorConfig {
+    fn default() -> Self {
+        Self {
+            tenant_id: None,
+            account_name: None,
+            duration_millis: DEFAULT_CREDENTIAL_DURATION_MILLIS,
+            permission: VendedPermission::default(),
+            federated_client_id: None,
+            api_key_salt: None,
+            api_key_hash_permissions: HashMap::new(),
+        }
+    }
+}
+
+impl AzureCredentialVendorConfig {
+    /// Create a new default config.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the tenant ID.
+    pub fn with_tenant_id(mut self, tenant_id: impl Into<String>) -> Self {
+        self.tenant_id = Some(tenant_id.into());
+        self
+    }
+
+    /// Set the storage account name.
+    pub fn with_account_name(mut self, account_name: impl Into<String>) -> Self {
+        self.account_name = Some(account_name.into());
+        self
+    }
+
+    /// Set the credential duration in milliseconds.
+    pub fn with_duration_millis(mut self, millis: u64) -> Self {
+        self.duration_millis = millis;
+        self
+    }
+
+    /// Set the permission level for vended credentials.
+    pub fn with_permission(mut self, permission: VendedPermission) -> Self {
+        self.permission = permission;
+        self
+    }
+
+    /// Set the federated client ID for Workload Identity Federation.
+    pub fn with_federated_client_id(mut self, client_id: impl Into<String>) -> Self {
+        self.federated_client_id = Some(client_id.into());
+        self
+    }
+
+    /// Set the API key salt for hashing.
+    pub fn with_api_key_salt(mut self, salt: impl Into<String>) -> Self {
+        self.api_key_salt = Some(salt.into());
+        self
+    }
+
+    /// Add an API key hash to permission mapping.
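+    ///
+    /// # Example
+    ///
+    /// A minimal sketch; the key, salt, and account name are illustrative:
+    ///
+    /// ```ignore
+    /// let hash = AzureCredentialVendor::hash_api_key("tenant-api-key", "server-side-salt");
+    /// let config = AzureCredentialVendorConfig::new()
+    ///     .with_account_name("myaccount")
+    ///     .with_api_key_salt("server-side-salt")
+    ///     .with_api_key_hash_permission(hash, VendedPermission::Write);
+    /// ```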
+    pub fn with_api_key_hash_permission(
+        mut self,
+        key_hash: impl Into<String>,
+        permission: VendedPermission,
+    ) -> Self {
+        self.api_key_hash_permissions
+            .insert(key_hash.into(), permission);
+        self
+    }
+
+    /// Set the entire API key hash permissions map.
+    pub fn with_api_key_hash_permissions(
+        mut self,
+        permissions: HashMap<String, VendedPermission>,
+    ) -> Self {
+        self.api_key_hash_permissions = permissions;
+        self
+    }
+}
+
+/// Azure credential vendor that generates SAS tokens.
+#[derive(Debug)]
+pub struct AzureCredentialVendor {
+    config: AzureCredentialVendorConfig,
+    http_client: reqwest::Client,
+}
+
+impl AzureCredentialVendor {
+    /// Create a new Azure credential vendor with the specified configuration.
+    pub fn new(config: AzureCredentialVendorConfig) -> Self {
+        Self {
+            config,
+            http_client: reqwest::Client::new(),
+        }
+    }
+
+    /// Hash an API key using SHA-256 with salt (Polaris pattern).
+    /// Format: SHA256(api_key + ":" + salt) as hex string.
+    pub fn hash_api_key(api_key: &str, salt: &str) -> String {
+        let mut hasher = Sha256::new();
+        hasher.update(format!("{}:{}", api_key, salt));
+        format!("{:x}", hasher.finalize())
+    }
+
+    /// Extract a session name from a JWT token (best effort, no validation).
+    /// Decodes the payload and extracts the 'sub' or 'email' claim.
+    /// Falls back to "lance-azure-identity" if parsing fails.
+    fn derive_session_name_from_token(token: &str) -> String {
+        // JWT format: header.payload.signature
+        let parts: Vec<&str> = token.split('.').collect();
+        if parts.len() != 3 {
+            return "lance-azure-identity".to_string();
+        }
+
+        // Decode the payload (second part)
+        let payload = match URL_SAFE_NO_PAD.decode(parts[1]) {
+            Ok(bytes) => bytes,
+            Err(_) => {
+                // Try standard base64 as fallback
+                match base64::engine::general_purpose::STANDARD_NO_PAD.decode(parts[1]) {
+                    Ok(bytes) => bytes,
+                    Err(_) => return "lance-azure-identity".to_string(),
+                }
+            }
+        };
+
+        // Parse as JSON and extract 'sub' or 'email'
+        let json: serde_json::Value = match serde_json::from_slice(&payload) {
+            Ok(v) => v,
+            Err(_) => return "lance-azure-identity".to_string(),
+        };
+
+        let subject = json
+            .get("sub")
+            .or_else(|| json.get("email"))
+            .and_then(|v| v.as_str())
+            .unwrap_or("unknown");
+
+        // Sanitize: keep only alphanumeric, @, -, .
+        let sanitized: String = subject
+            .chars()
+            .filter(|c| c.is_alphanumeric() || *c == '@' || *c == '-' || *c == '.')
+            .collect();
+
+        format!("lance-{}", sanitized)
+    }
+
+    /// Build SAS permissions based on the VendedPermission level.
+    ///
+    /// - Read: read + list
+    /// - Write: read + list + write + add + create
+    /// - Admin: read + list + write + add + create + delete
+    #[allow(clippy::field_reassign_with_default)]
+    fn build_sas_permissions(permission: VendedPermission) -> BlobSasPermissions {
+        let mut p = BlobSasPermissions::default();
+
+        // All permission levels have read access
+        p.read = true;
+        p.list = true;
+
+        // Write and Admin have write access
+        if permission.can_write() {
+            p.write = true;
+            p.add = true;
+            p.create = true;
+        }
+
+        // Admin has delete access
+        if permission.can_delete() {
+            p.delete = true;
+        }
+
+        p
+    }
+
+    /// Generate a SAS token for the specified container.
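+    ///
+    /// Returns the SAS token string together with its expiry in Unix epoch
+    /// milliseconds. The user delegation key backing the token is capped at
+    /// Azure's 7-day maximum even when a longer duration is configured.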
+    async fn generate_sas_token(&self, account: &str, container: &str) -> Result<(String, u64)> {
+        let credential =
+            DefaultAzureCredential::create(azure_identity::TokenCredentialOptions::default())
+                .map_err(|e| Error::IO {
+                    source: Box::new(std::io::Error::other(format!(
+                        "Failed to create Azure credentials: {}",
+                        e
+                    ))),
+                    location: snafu::location!(),
+                })?;
+
+        let credential: Arc<dyn TokenCredential> = Arc::new(credential);
+
+        let blob_service_client = BlobServiceClient::new(account, credential.clone());
+
+        // Calculate times using the time crate (which the Azure SDK uses)
+        let now = time::OffsetDateTime::now_utc();
+        let duration_millis = self.config.duration_millis as i64;
+        let end_time = now + time::Duration::milliseconds(duration_millis);
+
+        // Azure limits user delegation keys to 7 days
+        let max_key_end = now + time::Duration::days(7) - time::Duration::seconds(60);
+        let key_end_time = if end_time > max_key_end {
+            max_key_end
+        } else {
+            end_time
+        };
+
+        // Get user delegation key (note: typo in the library method name)
+        let user_delegation_key = blob_service_client
+            .get_user_deligation_key(now, key_end_time)
+            .await
+            .map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to get user delegation key for account '{}': {}",
+                    account, e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        let permissions = Self::build_sas_permissions(self.config.permission);
+
+        // Generate SAS token for the container
+        let container_client = blob_service_client.container_client(container);
+
+        let sas_token = container_client
+            .user_delegation_shared_access_signature(
+                permissions,
+                &user_delegation_key.user_deligation_key,
+            )
+            .await
+            .map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to generate SAS token for container '{}': {}",
+                    container, e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        let expires_at_millis =
+            (end_time.unix_timestamp() * 1000 + end_time.millisecond() as i64) as u64;
+
+        let token = sas_token.token().map_err(|e| Error::IO {
+            source: Box::new(std::io::Error::other(format!(
+                "Failed to get SAS token: {}",
+                e
+            ))),
+            location: snafu::location!(),
+        })?;
+
+        Ok((token, expires_at_millis))
+    }
+
+    /// Generate a SAS token with a specific permission level.
+    async fn generate_sas_token_with_permission(
+        &self,
+        account: &str,
+        container: &str,
+        permission: VendedPermission,
+    ) -> Result<(String, u64)> {
+        let credential =
+            DefaultAzureCredential::create(azure_identity::TokenCredentialOptions::default())
+                .map_err(|e| Error::IO {
+                    source: Box::new(std::io::Error::other(format!(
+                        "Failed to create Azure credentials: {}",
+                        e
+                    ))),
+                    location: snafu::location!(),
+                })?;
+
+        let credential: Arc<dyn TokenCredential> = Arc::new(credential);
+        let blob_service_client = BlobServiceClient::new(account, credential.clone());
+
+        let now = time::OffsetDateTime::now_utc();
+        let duration_millis = self.config.duration_millis as i64;
+        let end_time = now + time::Duration::milliseconds(duration_millis);
+
+        let max_key_end = now + time::Duration::days(7) - time::Duration::seconds(60);
+        let key_end_time = if end_time > max_key_end {
+            max_key_end
+        } else {
+            end_time
+        };
+
+        let user_delegation_key = blob_service_client
+            .get_user_deligation_key(now, key_end_time)
+            .await
+            .map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to get user delegation key for account '{}': {}",
+                    account, e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        let permissions = Self::build_sas_permissions(permission);
+        let container_client = blob_service_client.container_client(container);
+
+        let sas_token = container_client
+            .user_delegation_shared_access_signature(
+                permissions,
+                &user_delegation_key.user_deligation_key,
+            )
+            .await
+            .map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to generate SAS token for container '{}': {}",
+                    container, e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        let expires_at_millis =
+            (end_time.unix_timestamp() * 1000 + end_time.millisecond() as i64) as u64;
+
+        let token = sas_token.token().map_err(|e| Error::IO {
+            source: Box::new(std::io::Error::other(format!(
+                "Failed to get SAS token: {}",
+                e
+            ))),
+            location: snafu::location!(),
+        })?;
+
+        Ok((token, expires_at_millis))
+    }
+
+    /// Generate a directory-scoped SAS token.
+    ///
+    /// Unlike container-level SAS tokens, this restricts access to a specific directory
+    /// path within the container. This is more secure for multi-tenant scenarios.
+    ///
+    /// # Arguments
+    /// * `account` - Storage account name
+    /// * `container` - Container name
+    /// * `path` - Directory path within the container (e.g., "tenant-a/tables/my-table")
+    /// * `permission` - Permission level for the SAS token
+    async fn generate_directory_sas_token(
+        &self,
+        account: &str,
+        container: &str,
+        path: &str,
+        permission: VendedPermission,
+    ) -> Result<(String, u64)> {
+        let credential =
+            DefaultAzureCredential::create(azure_identity::TokenCredentialOptions::default())
+                .map_err(|e| Error::IO {
+                    source: Box::new(std::io::Error::other(format!(
+                        "Failed to create Azure credentials: {}",
+                        e
+                    ))),
+                    location: snafu::location!(),
+                })?;
+
+        let credential: Arc<dyn TokenCredential> = Arc::new(credential);
+        let blob_service_client = BlobServiceClient::new(account, credential.clone());
+
+        let now = time::OffsetDateTime::now_utc();
+        let duration_millis = self.config.duration_millis as i64;
+        let end_time = now + time::Duration::milliseconds(duration_millis);
+
+        let max_key_end = now + time::Duration::days(7) - time::Duration::seconds(60);
+        let key_end_time = if end_time > max_key_end {
+            max_key_end
+        } else {
+            end_time
+        };
+
+        let user_delegation_key = blob_service_client
+            .get_user_deligation_key(now, key_end_time)
+            .await
+            .map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to get user delegation key for account '{}': {}",
+                    account, e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        // Normalize path: remove leading/trailing slashes
+        let normalized_path = path.trim_matches('/');
+        let depth = if normalized_path.is_empty() {
+            0
+        } else {
+            normalized_path.split('/').count()
+        };
+
+        // Build canonical resource path for directory-level SAS
+        let canonical_resource = format!("/blob/{}/{}/{}", account, container, normalized_path);
+
+        // Convert user delegation key to SasKey
+        let sas_key = SasKey::UserDelegationKey(user_delegation_key.user_deligation_key);
+
+        let permissions = Self::build_sas_permissions(permission);
+
+        // Create directory-scoped SAS signature
+        let sas = BlobSharedAccessSignature::new(
+            sas_key,
+            canonical_resource,
+            permissions,
+            end_time,
+            BlobSignedResource::Directory,
+        )
+        .signed_directory_depth(depth as u8);
+
+        let token = sas.token().map_err(|e| Error::IO {
+            source: Box::new(std::io::Error::other(format!(
+                "Failed to generate directory SAS token: {}",
+                e
+            ))),
+            location: snafu::location!(),
+        })?;
+
+        let expires_at_millis =
+            (end_time.unix_timestamp() * 1000 + end_time.millisecond() as i64) as u64;
+
+        info!(
+            "Azure directory-scoped SAS generated: account={}, container={}, path={}, depth={}, permission={}",
+            account, container, normalized_path, depth, permission
+        );
+
+        Ok((token, expires_at_millis))
+    }
+
+    /// Exchange an OIDC token for an Azure AD access token using Workload Identity Federation.
+    ///
+    /// This requires:
+    /// 1. An Azure AD App Registration with Federated Credentials configured
+    /// 2. The OIDC token's issuer and subject to match the Federated Credential configuration
+    async fn exchange_oidc_for_azure_token(&self, oidc_token: &str) -> Result<String> {
+        let tenant_id = self
+            .config
+            .tenant_id
+            .as_ref()
+            .ok_or_else(|| Error::InvalidInput {
+                source: "azure_tenant_id must be configured for OIDC token exchange".into(),
+                location: snafu::location!(),
+            })?;
+
+        let client_id =
+            self.config
+                .federated_client_id
+                .as_ref()
+                .ok_or_else(|| Error::InvalidInput {
+                    source: "azure_federated_client_id must be configured for OIDC token exchange"
+                        .into(),
+                    location: snafu::location!(),
+                })?;
+
+        let token_url = format!(
+            "https://login.microsoftonline.com/{}/oauth2/v2.0/token",
+            tenant_id
+        );
+
+        let params = [
+            ("grant_type", "client_credentials"),
+            (
+                "client_assertion_type",
+                "urn:ietf:params:oauth:client-assertion-type:jwt-bearer",
+            ),
+            ("client_assertion", oidc_token),
+            ("client_id", client_id),
+            ("scope", "https://storage.azure.com/.default"),
+        ];
+
+        let response = self
+            .http_client
+            .post(&token_url)
+            .form(&params)
+            .send()
+            .await
+            .map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to exchange OIDC token for Azure AD token: {}",
+                    e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        if !response.status().is_success() {
+            let status = response.status();
+            let body = response.text().await.unwrap_or_default();
+            return Err(Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Azure AD token exchange failed with status {}: {}",
+                    status, body
+                ))),
+                location: snafu::location!(),
+            });
+        }
+
+        let token_response: serde_json::Value = response.json().await.map_err(|e| Error::IO {
+            source: Box::new(std::io::Error::other(format!(
+                "Failed to parse Azure AD token response: {}",
+                e
+            ))),
+            location: snafu::location!(),
+        })?;
+
+        token_response
+            .get("access_token")
+            .and_then(|v| v.as_str())
+            .map(|s| s.to_string())
+            .ok_or_else(|| Error::IO {
+                source: Box::new(std::io::Error::other(
+                    "Azure AD token response missing access_token",
+                )),
+                location: snafu::location!(),
+            })
+    }
+
+    /// Generate a SAS token using a federated Azure AD token.
+    ///
+    /// Uses directory-scoped SAS when a path is provided, container-level otherwise.
+    async fn generate_sas_with_azure_token(
+        &self,
+        azure_token: &str,
+        account: &str,
+        container: &str,
+        path: &str,
+        permission: VendedPermission,
+    ) -> Result<(String, u64)> {
+        // Create a custom TokenCredential that uses our Azure AD token
+        let credential = FederatedTokenCredential::new(azure_token.to_string());
+        let credential: Arc<dyn TokenCredential> = Arc::new(credential);
+
+        let blob_service_client = BlobServiceClient::new(account, credential.clone());
+
+        let now = time::OffsetDateTime::now_utc();
+        let duration_millis = self.config.duration_millis as i64;
+        let end_time = now + time::Duration::milliseconds(duration_millis);
+
+        let max_key_end = now + time::Duration::days(7) - time::Duration::seconds(60);
+        let key_end_time = if end_time > max_key_end {
+            max_key_end
+        } else {
+            end_time
+        };
+
+        let user_delegation_key = blob_service_client
+            .get_user_deligation_key(now, key_end_time)
+            .await
+            .map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to get user delegation key with federated token: {}",
+                    e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        let permissions = Self::build_sas_permissions(permission);
+
+        let expires_at_millis =
+            (end_time.unix_timestamp() * 1000 + end_time.millisecond() as i64) as u64;
+
+        // Use directory-scoped SAS when a path is provided
+        let normalized_path = path.trim_matches('/');
+        let token = if normalized_path.is_empty() {
+            // Container-level SAS
+            let container_client = blob_service_client.container_client(container);
+            let sas_token = container_client
+                .user_delegation_shared_access_signature(
+                    permissions,
+                    &user_delegation_key.user_deligation_key,
+                )
+                .await
+                .map_err(|e| Error::IO {
+                    source: Box::new(std::io::Error::other(format!(
+                        "Failed to generate SAS token with federated token: {}",
+                        e
+                    ))),
+                    location: snafu::location!(),
+                })?;
+
+            sas_token.token().map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to get SAS token: {}",
+                    e
+                ))),
+                location: snafu::location!(),
+            })?
+        } else {
+            // Directory-scoped SAS
+            let depth = normalized_path.split('/').count();
+            let canonical_resource = format!("/blob/{}/{}/{}", account, container, normalized_path);
+            let sas_key = SasKey::UserDelegationKey(user_delegation_key.user_deligation_key);
+
+            let sas = BlobSharedAccessSignature::new(
+                sas_key,
+                canonical_resource,
+                permissions,
+                end_time,
+                BlobSignedResource::Directory,
+            )
+            .signed_directory_depth(depth as u8);
+
+            sas.token().map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to generate directory SAS token with federated token: {}",
+                    e
+                ))),
+                location: snafu::location!(),
+            })?
+        };
+
+        Ok((token, expires_at_millis))
+    }
+
+    /// Vend credentials using Workload Identity Federation (for auth_token).
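+    ///
+    /// Reached via `vend_credentials` when the caller supplies an identity with
+    /// an `auth_token`. A minimal sketch of that entry point (the tenant, client
+    /// ID, and URI are illustrative; the URI host is taken as the container):
+    ///
+    /// ```ignore
+    /// let vendor = AzureCredentialVendor::new(
+    ///     AzureCredentialVendorConfig::new()
+    ///         .with_account_name("myaccount")
+    ///         .with_tenant_id("my-tenant-id")
+    ///         .with_federated_client_id("my-app-client-id"),
+    /// );
+    /// let identity = Identity {
+    ///     api_key: None,
+    ///     auth_token: Some(oidc_jwt),
+    /// };
+    /// let creds = vendor
+    ///     .vend_credentials("az://container/path/table", Some(&identity))
+    ///     .await?;
+    /// ```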
+    async fn vend_with_web_identity(
+        &self,
+        account: &str,
+        container: &str,
+        path: &str,
+        auth_token: &str,
+    ) -> Result<VendedCredentials> {
+        let session_name = Self::derive_session_name_from_token(auth_token);
+        debug!(
+            "Azure vend_with_web_identity: account={}, container={}, path={}, session={}",
+            account, container, path, session_name
+        );
+
+        // Exchange OIDC token for Azure AD token
+        let azure_token = self.exchange_oidc_for_azure_token(auth_token).await?;
+
+        // Generate SAS token using the Azure AD token;
+        // use directory-scoped SAS when a path is provided
+        let (sas_token, expires_at_millis) = self
+            .generate_sas_with_azure_token(
+                &azure_token,
+                account,
+                container,
+                path,
+                self.config.permission,
+            )
+            .await?;
+
+        let mut storage_options = HashMap::new();
+        storage_options.insert("azure_storage_sas_token".to_string(), sas_token.clone());
+        storage_options.insert(
+            "azure_storage_account_name".to_string(),
+            account.to_string(),
+        );
+        storage_options.insert(
+            "expires_at_millis".to_string(),
+            expires_at_millis.to_string(),
+        );
+
+        info!(
+            "Azure credentials vended (web identity): account={}, container={}, path={}, permission={}, expires_at={}, sas_token={}",
+            account, container, path, self.config.permission, expires_at_millis, redact_credential(&sas_token)
+        );
+
+        Ok(VendedCredentials::new(storage_options, expires_at_millis))
+    }
+
+    /// Vend credentials using API key validation.
+    async fn vend_with_api_key(
+        &self,
+        account: &str,
+        container: &str,
+        path: &str,
+        api_key: &str,
+    ) -> Result<VendedCredentials> {
+        let salt = self
+            .config
+            .api_key_salt
+            .as_ref()
+            .ok_or_else(|| Error::InvalidInput {
+                source: "api_key_salt must be configured to use API key authentication".into(),
+                location: snafu::location!(),
+            })?;
+
+        let key_hash = Self::hash_api_key(api_key, salt);
+
+        // Look up permission from hash mapping
+        let permission = self
+            .config
+            .api_key_hash_permissions
+            .get(&key_hash)
+            .copied()
+            .ok_or_else(|| {
+                warn!(
+                    "Invalid API key: hash {} not found in permissions map",
+                    &key_hash[..8]
+                );
+                Error::InvalidInput {
+                    source: "Invalid API key".into(),
+                    location: snafu::location!(),
+                }
+            })?;
+
+        debug!(
+            "Azure vend_with_api_key: account={}, container={}, path={}, permission={}",
+            account, container, path, permission
+        );
+
+        // Use directory-scoped SAS when a path is provided, container-level otherwise
+        let (sas_token, expires_at_millis) = if path.is_empty() {
+            self.generate_sas_token_with_permission(account, container, permission)
+                .await?
+        } else {
+            self.generate_directory_sas_token(account, container, path, permission)
+                .await?
+        };
+
+        let mut storage_options = HashMap::new();
+        storage_options.insert("azure_storage_sas_token".to_string(), sas_token.clone());
+        storage_options.insert(
+            "azure_storage_account_name".to_string(),
+            account.to_string(),
+        );
+        storage_options.insert(
+            "expires_at_millis".to_string(),
+            expires_at_millis.to_string(),
+        );
+
+        info!(
+            "Azure credentials vended (api_key): account={}, container={}, path={}, permission={}, expires_at={}, sas_token={}",
+            account, container, path, permission, expires_at_millis, redact_credential(&sas_token)
+        );
+
+        Ok(VendedCredentials::new(storage_options, expires_at_millis))
+    }
+}
+
+/// A custom TokenCredential that wraps a pre-obtained Azure AD access token.
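+///
+/// The blob client authenticates through the `TokenCredential` trait, so wrapping
+/// the already-exchanged AD token lets the user delegation key request run under
+/// the federated caller's identity instead of the ambient default credential.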
+#[derive(Debug)]
+struct FederatedTokenCredential {
+    token: String,
+}
+
+impl FederatedTokenCredential {
+    fn new(token: String) -> Self {
+        Self { token }
+    }
+}
+
+#[async_trait]
+impl TokenCredential for FederatedTokenCredential {
+    async fn get_token(
+        &self,
+        _scopes: &[&str],
+    ) -> std::result::Result<azure_core::auth::AccessToken, azure_core::Error> {
+        // Return the pre-obtained token with a 1-hour expiry (conservative estimate)
+        let expires_on = time::OffsetDateTime::now_utc() + time::Duration::hours(1);
+        Ok(azure_core::auth::AccessToken::new(
+            azure_core::auth::Secret::new(self.token.clone()),
+            expires_on,
+        ))
+    }
+
+    async fn clear_cache(&self) -> std::result::Result<(), azure_core::Error> {
+        Ok(())
+    }
+}
+
+#[async_trait]
+impl CredentialVendor for AzureCredentialVendor {
+    async fn vend_credentials(
+        &self,
+        table_location: &str,
+        identity: Option<&Identity>,
+    ) -> Result<VendedCredentials> {
+        debug!(
+            "Azure credential vending: location={}, permission={}, identity={:?}",
+            table_location,
+            self.config.permission,
+            identity.map(|i| format!(
+                "api_key={}, auth_token={}",
+                i.api_key.is_some(),
+                i.auth_token.is_some()
+            ))
+        );
+
+        let url = uri_to_url(table_location)?;
+
+        let container = url.host_str().ok_or_else(|| Error::InvalidInput {
+            source: format!("Azure URI '{}' missing container", table_location).into(),
+            location: snafu::location!(),
+        })?;
+
+        // Extract path for directory-scoped SAS
+        let path = url.path().trim_start_matches('/');
+
+        let account =
+            self.config
+                .account_name
+                .as_ref()
+                .ok_or_else(|| Error::InvalidInput {
+                    source: "Azure credential vending requires 'credential_vendor.azure_account_name' to be set in configuration".into(),
+                    location: snafu::location!(),
+                })?;
+
+        // Dispatch based on identity
+        match identity {
+            Some(id) if id.auth_token.is_some() => {
+                let auth_token = id.auth_token.as_ref().unwrap();
+                self.vend_with_web_identity(account, container, path, auth_token)
+                    .await
+            }
+            Some(id) if id.api_key.is_some() => {
+                let api_key = id.api_key.as_ref().unwrap();
+                self.vend_with_api_key(account, container, path, api_key)
+                    .await
+            }
+            Some(_) => Err(Error::InvalidInput {
+                source: "Identity provided but neither auth_token nor api_key is set".into(),
+                location: snafu::location!(),
+            }),
+            None => {
+                // Static credential vending using DefaultAzureCredential;
+                // use directory-scoped SAS when a path is provided, container-level otherwise
+                let (sas_token, expires_at_millis) = if path.is_empty() {
+                    self.generate_sas_token(account, container).await?
+                } else {
+                    self.generate_directory_sas_token(
+                        account,
+                        container,
+                        path,
+                        self.config.permission,
+                    )
+                    .await?
+ }; + + let mut storage_options = HashMap::new(); + storage_options.insert("azure_storage_sas_token".to_string(), sas_token.clone()); + storage_options.insert("azure_storage_account_name".to_string(), account.clone()); + storage_options.insert( + "expires_at_millis".to_string(), + expires_at_millis.to_string(), + ); + + info!( + "Azure credentials vended (static): account={}, container={}, path={}, permission={}, expires_at={}, sas_token={}", + account, container, path, self.config.permission, expires_at_millis, redact_credential(&sas_token) + ); + + Ok(VendedCredentials::new(storage_options, expires_at_millis)) + } + } + } + + fn provider_name(&self) -> &'static str { + "azure" + } + + fn permission(&self) -> VendedPermission { + self.config.permission + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_config_builder() { + let config = AzureCredentialVendorConfig::new() + .with_tenant_id("my-tenant-id") + .with_account_name("myaccount") + .with_duration_millis(7200000); + + assert_eq!(config.tenant_id, Some("my-tenant-id".to_string())); + assert_eq!(config.account_name, Some("myaccount".to_string())); + assert_eq!(config.duration_millis, 7200000); + } + + #[test] + fn test_build_sas_permissions_read() { + let permissions = AzureCredentialVendor::build_sas_permissions(VendedPermission::Read); + + assert!(permissions.read, "Read permission should have read=true"); + assert!(permissions.list, "Read permission should have list=true"); + assert!( + !permissions.write, + "Read permission should have write=false" + ); + assert!(!permissions.add, "Read permission should have add=false"); + assert!( + !permissions.create, + "Read permission should have create=false" + ); + assert!( + !permissions.delete, + "Read permission should have delete=false" + ); + } + + #[test] + fn test_build_sas_permissions_write() { + let permissions = AzureCredentialVendor::build_sas_permissions(VendedPermission::Write); + + assert!(permissions.read, "Write permission should have read=true"); + assert!(permissions.list, "Write permission should have list=true"); + assert!(permissions.write, "Write permission should have write=true"); + assert!(permissions.add, "Write permission should have add=true"); + assert!( + permissions.create, + "Write permission should have create=true" + ); + assert!( + !permissions.delete, + "Write permission should have delete=false" + ); + } + + #[test] + fn test_build_sas_permissions_admin() { + let permissions = AzureCredentialVendor::build_sas_permissions(VendedPermission::Admin); + + assert!(permissions.read, "Admin permission should have read=true"); + assert!(permissions.list, "Admin permission should have list=true"); + assert!(permissions.write, "Admin permission should have write=true"); + assert!(permissions.add, "Admin permission should have add=true"); + assert!( + permissions.create, + "Admin permission should have create=true" + ); + assert!( + permissions.delete, + "Admin permission should have delete=true" + ); + } +} diff --git a/rust/lance-namespace-impls/src/credentials/cache.rs b/rust/lance-namespace-impls/src/credentials/cache.rs new file mode 100644 index 00000000000..6e7c6c4dcf7 --- /dev/null +++ b/rust/lance-namespace-impls/src/credentials/cache.rs @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Credential caching for cloud storage access. +//! +//! This module provides a caching wrapper for credential vendors that reduces +//! 
the number of credential vending requests (e.g., STS calls) by caching
+//! credentials until they are close to expiration.
+//!
+//! ## Caching Strategy
+//!
+//! - **Cache Key**: Table location + identity hash (api_key hash or auth_token hash)
+//! - **TTL**: Half of the credential's remaining lifetime, capped at 30 minutes
+//! - **Eviction**: Credentials are evicted when TTL expires or when explicitly cleared
+//!
+//! ## Example
+//!
+//! ```ignore
+//! use lance_namespace_impls::credentials::cache::CachingCredentialVendor;
+//!
+//! let vendor = AwsCredentialVendor::new(config).await?;
+//! let cached_vendor = CachingCredentialVendor::new(Box::new(vendor));
+//!
+//! // First call hits the underlying vendor
+//! let creds1 = cached_vendor.vend_credentials("s3://bucket/table", None).await?;
+//!
+//! // Subsequent calls within TTL return cached credentials
+//! let creds2 = cached_vendor.vend_credentials("s3://bucket/table", None).await?;
+//! ```
+
+use std::collections::HashMap;
+use std::hash::{Hash, Hasher};
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use async_trait::async_trait;
+use lance_core::Result;
+use lance_namespace::models::Identity;
+use log::debug;
+use tokio::sync::RwLock;
+
+use super::{CredentialVendor, VendedCredentials, VendedPermission};
+
+/// Maximum cache TTL: 30 minutes.
+/// Even if credentials are valid for longer, we refresh more frequently
+/// to handle clock skew and ensure freshness.
+const MAX_CACHE_TTL_SECS: u64 = 30 * 60;
+
+/// Minimum cache TTL: 1 minute.
+/// If credentials expire sooner than this, we don't cache them.
+const MIN_CACHE_TTL_SECS: u64 = 60;
+
+/// A cached credential entry with expiration tracking.
+#[derive(Clone)]
+struct CacheEntry {
+    credentials: VendedCredentials,
+    /// When this cache entry should be considered stale
+    cached_until: Instant,
+}
+
+impl std::fmt::Debug for CacheEntry {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("CacheEntry")
+            .field("credentials", &"[redacted]")
+            .field("cached_until", &self.cached_until)
+            .finish()
+    }
+}
+
+impl CacheEntry {
+    fn is_stale(&self) -> bool {
+        Instant::now() >= self.cached_until
+    }
+}
+
+/// A caching wrapper for credential vendors.
+///
+/// This wrapper caches vended credentials to reduce the number of underlying
+/// credential vending operations (e.g., STS calls). Credentials are cached
+/// until half their lifetime has passed, capped at 30 minutes.
+#[derive(Debug)]
+pub struct CachingCredentialVendor {
+    inner: Box<dyn CredentialVendor>,
+    cache: Arc<RwLock<HashMap<String, CacheEntry>>>,
+}
+
+impl CachingCredentialVendor {
+    /// Create a new caching credential vendor wrapping the given vendor.
+    pub fn new(inner: Box<dyn CredentialVendor>) -> Self {
+        Self {
+            inner,
+            cache: Arc::new(RwLock::new(HashMap::new())),
+        }
+    }
+
+    /// Build a cache key from the table location and identity.
+    ///
+    /// The key is a hash of the location and identity fields to ensure
+    /// different identities get different cached credentials.
+    fn build_cache_key(table_location: &str, identity: Option<&Identity>) -> String {
+        let mut hasher = std::collections::hash_map::DefaultHasher::new();
+
+        table_location.hash(&mut hasher);
+
+        if let Some(id) = identity {
+            if let Some(ref api_key) = id.api_key {
+                ":api_key:".hash(&mut hasher);
+                api_key.hash(&mut hasher);
+            }
+            if let Some(ref auth_token) = id.auth_token {
+                ":auth_token:".hash(&mut hasher);
+                // Hash only a bounded prefix of the token so very large tokens
+                // stay cheap; slice bytes (not chars) so a multi-byte UTF-8
+                // boundary cannot cause a panic.
+                let prefix_len = auth_token.len().min(64);
+                auth_token.as_bytes()[..prefix_len].hash(&mut hasher);
+            }
+        } else {
+            ":no_identity".hash(&mut hasher);
+        }
+
+        format!("{:016x}", hasher.finish())
+    }
+
+    /// Calculate the cache TTL for the given credentials.
+    ///
+    /// Returns the TTL as a Duration, or None if the credentials should not be cached.
+    fn calculate_cache_ttl(credentials: &VendedCredentials) -> Option<Duration> {
+        let now_millis = std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .expect("time went backwards")
+            .as_millis() as u64;
+
+        if credentials.expires_at_millis <= now_millis {
+            // Already expired
+            return None;
+        }
+
+        let remaining_millis = credentials.expires_at_millis - now_millis;
+        let remaining_secs = remaining_millis / 1000;
+
+        // TTL is half the remaining lifetime
+        let ttl_secs = remaining_secs / 2;
+
+        // Cap between MIN and MAX
+        if ttl_secs < MIN_CACHE_TTL_SECS {
+            None // Don't cache if TTL is too short
+        } else {
+            Some(Duration::from_secs(ttl_secs.min(MAX_CACHE_TTL_SECS)))
+        }
+    }
+
+    /// Clear all cached credentials.
+    pub async fn clear_cache(&self) {
+        let mut cache = self.cache.write().await;
+        cache.clear();
+        debug!("Credential cache cleared");
+    }
+
+    /// Get the number of cached entries.
+    pub async fn cache_size(&self) -> usize {
+        let cache = self.cache.read().await;
+        cache.len()
+    }
+
+    /// Remove stale entries from the cache.
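+    ///
+    /// Returns the number of entries removed. A sketch of periodic maintenance:
+    ///
+    /// ```ignore
+    /// let evicted = cached_vendor.evict_stale().await;
+    /// ```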
+    pub async fn evict_stale(&self) -> usize {
+        let mut cache = self.cache.write().await;
+        let before = cache.len();
+        cache.retain(|_, entry| !entry.is_stale());
+        let evicted = before - cache.len();
+        if evicted > 0 {
+            debug!("Evicted {} stale credential cache entries", evicted);
+        }
+        evicted
+    }
+}
+
+#[async_trait]
+impl CredentialVendor for CachingCredentialVendor {
+    async fn vend_credentials(
+        &self,
+        table_location: &str,
+        identity: Option<&Identity>,
+    ) -> Result<VendedCredentials> {
+        let cache_key = Self::build_cache_key(table_location, identity);
+
+        // Try to get from cache first
+        {
+            let cache = self.cache.read().await;
+            if let Some(entry) = cache.get(&cache_key) {
+                if !entry.is_stale() && !entry.credentials.is_expired() {
+                    debug!(
+                        "Credential cache hit for location={}, provider={}",
+                        table_location,
+                        self.inner.provider_name()
+                    );
+                    return Ok(entry.credentials.clone());
+                }
+            }
+        }
+
+        // Cache miss or stale - vend new credentials
+        debug!(
+            "Credential cache miss for location={}, provider={}",
+            table_location,
+            self.inner.provider_name()
+        );
+
+        let credentials = self
+            .inner
+            .vend_credentials(table_location, identity)
+            .await?;
+
+        // Cache the new credentials if TTL is sufficient
+        if let Some(ttl) = Self::calculate_cache_ttl(&credentials) {
+            let entry = CacheEntry {
+                credentials: credentials.clone(),
+                cached_until: Instant::now() + ttl,
+            };
+
+            let mut cache = self.cache.write().await;
+            cache.insert(cache_key, entry);
+
+            debug!(
+                "Cached credentials for location={}, ttl={}s",
+                table_location,
+                ttl.as_secs()
+            );
+        }
+
+        Ok(credentials)
+    }
+
+    fn provider_name(&self) -> &'static str {
+        self.inner.provider_name()
+    }
+
+    fn permission(&self) -> VendedPermission {
+        self.inner.permission()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::atomic::{AtomicU32, Ordering};
+
+    /// A mock credential vendor for testing.
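+    ///
+    /// Returns fixed storage options that expire `duration_millis` from now and
+    /// counts invocations. A sketch of typical use in these tests:
+    ///
+    /// ```ignore
+    /// let mock = MockVendor::new(3600 * 1000); // one-hour credentials
+    /// let cached = CachingCredentialVendor::new(Box::new(mock));
+    /// ```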
+    #[derive(Debug)]
+    struct MockVendor {
+        call_count: AtomicU32,
+        duration_millis: u64,
+    }
+
+    impl MockVendor {
+        fn new(duration_millis: u64) -> Self {
+            Self {
+                call_count: AtomicU32::new(0),
+                duration_millis,
+            }
+        }
+    }
+
+    #[async_trait]
+    impl CredentialVendor for MockVendor {
+        async fn vend_credentials(
+            &self,
+            _table_location: &str,
+            _identity: Option<&Identity>,
+        ) -> Result<VendedCredentials> {
+            self.call_count.fetch_add(1, Ordering::SeqCst);
+
+            let now_millis = std::time::SystemTime::now()
+                .duration_since(std::time::UNIX_EPOCH)
+                .unwrap()
+                .as_millis() as u64;
+
+            let mut storage_options = HashMap::new();
+            storage_options.insert("test_key".to_string(), "test_value".to_string());
+
+            Ok(VendedCredentials::new(
+                storage_options,
+                now_millis + self.duration_millis,
+            ))
+        }
+
+        fn provider_name(&self) -> &'static str {
+            "mock"
+        }
+
+        fn permission(&self) -> VendedPermission {
+            VendedPermission::Read
+        }
+    }
+
+    #[test]
+    fn test_build_cache_key_no_identity() {
+        let key1 = CachingCredentialVendor::build_cache_key("s3://bucket/table1", None);
+        let key2 = CachingCredentialVendor::build_cache_key("s3://bucket/table2", None);
+        let key3 = CachingCredentialVendor::build_cache_key("s3://bucket/table1", None);
+
+        assert_ne!(key1, key2, "Different locations should have different keys");
+        assert_eq!(key1, key3, "Same location should have same key");
+    }
+
+    #[test]
+    fn test_build_cache_key_with_identity() {
+        let identity_api = Identity {
+            api_key: Some("my-api-key".to_string()),
+            auth_token: None,
+        };
+        let identity_token = Identity {
+            api_key: None,
+            auth_token: Some("my-token".to_string()),
+        };
+
+        let key_no_id = CachingCredentialVendor::build_cache_key("s3://bucket/table", None);
+        let key_api =
+            CachingCredentialVendor::build_cache_key("s3://bucket/table", Some(&identity_api));
+        let key_token =
+            CachingCredentialVendor::build_cache_key("s3://bucket/table", Some(&identity_token));
+
+        assert_ne!(key_no_id, key_api, "Identity should change key");
+        assert_ne!(key_no_id, key_token, "Identity should change key");
+        assert_ne!(
+            key_api, key_token,
+            "Different identity types should have different keys"
+        );
+    }
+
+    #[test]
+    fn test_calculate_cache_ttl() {
+        let now_millis = std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .unwrap()
+            .as_millis() as u64;
+
+        // Credentials with 1 hour remaining -> TTL should be 30 minutes (capped)
+        let creds_1h = VendedCredentials::new(HashMap::new(), now_millis + 3600 * 1000);
+        let ttl = CachingCredentialVendor::calculate_cache_ttl(&creds_1h);
+        assert_eq!(ttl, Some(Duration::from_secs(MAX_CACHE_TTL_SECS)));
+
+        // Credentials with 10 minutes remaining -> TTL should be 5 minutes
+        let creds_10m = VendedCredentials::new(HashMap::new(), now_millis + 10 * 60 * 1000);
+        let ttl = CachingCredentialVendor::calculate_cache_ttl(&creds_10m);
+        assert_eq!(ttl, Some(Duration::from_secs(5 * 60)));
+
+        // Credentials with 1 minute remaining -> TTL should be None (too short)
+        let creds_1m = VendedCredentials::new(HashMap::new(), now_millis + 60 * 1000);
+        let ttl = CachingCredentialVendor::calculate_cache_ttl(&creds_1m);
+        assert!(ttl.is_none(), "Should not cache short-lived credentials");
+
+        // Already expired credentials -> None
+        let creds_expired = VendedCredentials::new(HashMap::new(), now_millis - 1000);
+        let ttl = CachingCredentialVendor::calculate_cache_ttl(&creds_expired);
+        assert!(ttl.is_none(), "Should not cache expired credentials");
+    }
+
+    #[tokio::test]
+    async fn test_caching_reduces_calls() {
+        // Create a mock vendor with 1 hour credentials
+        let mock = MockVendor::new(3600 * 1000);
+        let cached = CachingCredentialVendor::new(Box::new(mock));
+
+        // First call should hit the underlying vendor
+        let _ = cached
+            .vend_credentials("s3://bucket/table", None)
+            .await
+            .unwrap();
+        assert_eq!(cached.cache_size().await, 1);
+
+        // We can't easily read the mock's call count through the boxed trait,
+        // so we assert on cache size instead.
+
+        // Second call should use cache (cache size stays at 1)
+        let _ = cached
+            .vend_credentials("s3://bucket/table", None)
+            .await
+            .unwrap();
+        assert_eq!(cached.cache_size().await, 1);
+
+        // Different location should create new cache entry
+        let _ = cached
+            .vend_credentials("s3://bucket/table2", None)
+            .await
+            .unwrap();
+        assert_eq!(cached.cache_size().await, 2);
+    }
+
+    #[tokio::test]
+    async fn test_clear_cache() {
+        let mock = MockVendor::new(3600 * 1000);
+        let cached = CachingCredentialVendor::new(Box::new(mock));
+
+        let _ = cached
+            .vend_credentials("s3://bucket/table", None)
+            .await
+            .unwrap();
+        assert_eq!(cached.cache_size().await, 1);
+
+        cached.clear_cache().await;
+        assert_eq!(cached.cache_size().await, 0);
+    }
+
+    #[tokio::test]
+    async fn test_different_identities_cached_separately() {
+        let mock = MockVendor::new(3600 * 1000);
+        let cached = CachingCredentialVendor::new(Box::new(mock));
+
+        let identity1 = Identity {
+            api_key: Some("key1".to_string()),
+            auth_token: None,
+        };
+        let identity2 = Identity {
+            api_key: Some("key2".to_string()),
+            auth_token: None,
+        };
+
+        // Same location with different identities should cache separately
+        let _ = cached
+            .vend_credentials("s3://bucket/table", Some(&identity1))
+            .await
+            .unwrap();
+        let _ = cached
+            .vend_credentials("s3://bucket/table", Some(&identity2))
+            .await
+            .unwrap();
+        let _ = cached
+            .vend_credentials("s3://bucket/table", None)
+            .await
+            .unwrap();
+
+        assert_eq!(cached.cache_size().await, 3);
+    }
+}
diff --git a/rust/lance-namespace-impls/src/credentials/gcp.rs b/rust/lance-namespace-impls/src/credentials/gcp.rs
new file mode 100644
index 00000000000..0749bdb1b97
--- /dev/null
+++ b/rust/lance-namespace-impls/src/credentials/gcp.rs
@@ -0,0 +1,999 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! GCP credential vending using downscoped OAuth2 tokens.
+//!
+//! This module provides credential vending for GCP Cloud Storage by obtaining
+//! OAuth2 access tokens and downscoping them using Credential Access Boundaries (CAB).
+//!
+//! ## Authentication
+//!
+//! This module uses [Application Default Credentials (ADC)][adc] for authentication.
+//! ADC automatically finds credentials based on the environment:
+//!
+//! 1. **`GOOGLE_APPLICATION_CREDENTIALS` environment variable**: Set this to the path
+//!    of a service account key file (JSON format) before starting the application.
+//! 2. **Well-known file locations**: `~/.config/gcloud/application_default_credentials.json`
+//!    on Linux/macOS, or the equivalent on Windows.
+//! 3. **Metadata server**: When running on GCP (Compute Engine, Cloud Run, GKE, etc.),
+//!    credentials are automatically obtained from the metadata server.
+//!
+//! For production deployments on GCP, using the metadata server (option 3) is recommended
+//! as it doesn't require managing key files.
+//!
+//! [adc]: https://cloud.google.com/docs/authentication/application-default-credentials
+//!
+//! ## Service Account Impersonation
+//!
+//! For multi-tenant scenarios, you can configure `service_account` to impersonate a
+//! different service account. The base credentials (from ADC) must have the
+//! `roles/iam.serviceAccountTokenCreator` role on the target service account.
+//!
+//! ## Permission Scoping
+//!
+//! Permissions are enforced using GCP's Credential Access Boundaries:
+//! - **Read**: `roles/storage.legacyObjectReader` + `roles/storage.objectViewer` (read and list)
+//! - **Write**: Read permissions + `roles/storage.legacyBucketWriter` + `roles/storage.objectCreator`
+//! - **Admin**: Write permissions + `roles/storage.objectAdmin` (includes delete)
+//!
+//! The downscoped token is restricted to the specific bucket and path prefix.
+//!
+//! Note: Legacy roles are used because modern roles like `storage.objectCreator` lack
+//! `storage.buckets.get`, which many client libraries require.
+
+use std::collections::HashMap;
+
+use async_trait::async_trait;
+use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine};
+use google_cloud_auth::credentials;
+use lance_core::{Error, Result};
+use lance_io::object_store::uri_to_url;
+use lance_namespace::models::Identity;
+use log::{debug, info, warn};
+use reqwest::Client;
+use serde::{Deserialize, Serialize};
+use sha2::{Digest, Sha256};
+
+use super::{redact_credential, CredentialVendor, VendedCredentials, VendedPermission};
+
+/// GCP STS token exchange endpoint for downscoping credentials.
+const STS_TOKEN_EXCHANGE_URL: &str = "https://sts.googleapis.com/v1/token";
+
+/// Configuration for GCP credential vending.
+#[derive(Debug, Clone, Default)]
+pub struct GcpCredentialVendorConfig {
+    /// Optional service account to impersonate.
+    ///
+    /// When set, the vendor will impersonate this service account using the
+    /// IAM Credentials API's generateAccessToken endpoint before downscoping.
+    /// This is useful for multi-tenant scenarios where you want to issue tokens
+    /// on behalf of different service accounts.
+    ///
+    /// The base credentials (from ADC) must have the `roles/iam.serviceAccountTokenCreator`
+    /// role on this service account.
+    ///
+    /// Format: `my-sa@project.iam.gserviceaccount.com`
+    pub service_account: Option<String>,
+
+    /// Permission level for vended credentials.
+    /// Default: Read
+    /// Permissions are enforced via Credential Access Boundaries (CAB).
+    ///
+    /// Note: GCP token duration cannot be configured; the token lifetime
+    /// is determined by the STS endpoint (typically 1 hour).
+    pub permission: VendedPermission,
+
+    /// Workload Identity Provider resource name for OIDC token exchange.
+    /// Required when using auth_token identity for Workload Identity Federation.
+    ///
+    /// Format: `projects/{project_number}/locations/global/workloadIdentityPools/{pool_id}/providers/{provider_id}`
+    ///
+    /// The OIDC token's issuer must match the provider's configuration.
+    pub workload_identity_provider: Option<String>,
+
+    /// Service account to impersonate after Workload Identity Federation.
+    /// Optional - if set, the exchanged token will be used to generate an
+    /// access token for this service account.
+    ///
+    /// Format: `my-sa@project.iam.gserviceaccount.com`
+    pub impersonation_service_account: Option<String>,
+
+    /// Salt for API key hashing.
+    /// Required when using API key authentication.
+    /// API keys are hashed as: SHA256(api_key + ":" + salt)
+    pub api_key_salt: Option<String>,
+
+    /// Map of SHA256(api_key + ":" + salt) -> permission level.
+    /// When an API key is provided, its hash is looked up in this map.
+    /// If found, the mapped permission is used instead of the default permission.
+    pub api_key_hash_permissions: HashMap<String, VendedPermission>,
+}
+
+impl GcpCredentialVendorConfig {
+    /// Create a new default config.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the service account to impersonate.
+    ///
+    /// When set, the vendor uses the IAM Credentials API to generate an access
+    /// token for this service account, then downscopes it with CAB.
+    ///
+    /// The base credentials (from ADC) must have the `roles/iam.serviceAccountTokenCreator`
+    /// role on this service account.
+    pub fn with_service_account(mut self, service_account: impl Into<String>) -> Self {
+        self.service_account = Some(service_account.into());
+        self
+    }
+
+    /// Set the permission level for vended credentials.
+    pub fn with_permission(mut self, permission: VendedPermission) -> Self {
+        self.permission = permission;
+        self
+    }
+
+    /// Set the Workload Identity Provider for OIDC token exchange.
+    pub fn with_workload_identity_provider(mut self, provider: impl Into<String>) -> Self {
+        self.workload_identity_provider = Some(provider.into());
+        self
+    }
+
+    /// Set the service account to impersonate after Workload Identity Federation.
+    pub fn with_impersonation_service_account(
+        mut self,
+        service_account: impl Into<String>,
+    ) -> Self {
+        self.impersonation_service_account = Some(service_account.into());
+        self
+    }
+
+    /// Set the API key salt for hashing.
+    pub fn with_api_key_salt(mut self, salt: impl Into<String>) -> Self {
+        self.api_key_salt = Some(salt.into());
+        self
+    }
+
+    /// Add an API key hash to permission mapping.
+    pub fn with_api_key_hash_permission(
+        mut self,
+        key_hash: impl Into<String>,
+        permission: VendedPermission,
+    ) -> Self {
+        self.api_key_hash_permissions
+            .insert(key_hash.into(), permission);
+        self
+    }
+
+    /// Set the entire API key hash permissions map.
+    pub fn with_api_key_hash_permissions(
+        mut self,
+        permissions: HashMap<String, VendedPermission>,
+    ) -> Self {
+        self.api_key_hash_permissions = permissions;
+        self
+    }
+}
+
+/// Access boundary rule for a single resource.
+#[derive(Debug, Serialize)]
+#[serde(rename_all = "camelCase")]
+struct AccessBoundaryRule {
+    available_resource: String,
+    available_permissions: Vec<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    availability_condition: Option<AvailabilityCondition>,
+}
+
+/// Condition for access boundary rule.
+#[derive(Debug, Clone, Serialize)]
+struct AvailabilityCondition {
+    expression: String,
+}
+
+/// Credential Access Boundary structure.
+#[derive(Debug, Serialize)]
+#[serde(rename_all = "camelCase")]
+struct CredentialAccessBoundary {
+    access_boundary: AccessBoundaryInner,
+}
+
+#[derive(Debug, Serialize)]
+#[serde(rename_all = "camelCase")]
+struct AccessBoundaryInner {
+    access_boundary_rules: Vec<AccessBoundaryRule>,
+}
+
+/// Response from STS token exchange.
+#[derive(Debug, Deserialize)]
+struct TokenExchangeResponse {
+    access_token: String,
+    #[serde(default)]
+    expires_in: Option<u64>,
+}
+
+/// Response from IAM generateAccessToken API.
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct GenerateAccessTokenResponse {
+    access_token: String,
+    #[allow(dead_code)]
+    expire_time: String,
+}
+
+/// GCP credential vendor that provides downscoped OAuth2 tokens.
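+///
+/// A minimal usage sketch (assumes ADC is available in the environment):
+///
+/// ```ignore
+/// let config = GcpCredentialVendorConfig::new()
+///     .with_permission(VendedPermission::Read);
+/// let vendor = GcpCredentialVendor::new(config).await?;
+/// let creds = vendor.vend_credentials("gs://my-bucket/path/to/table", None).await?;
+/// ```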
+pub struct GcpCredentialVendor {
+    config: GcpCredentialVendorConfig,
+    http_client: Client,
+    credential: credentials::Credential,
+}
+
+impl std::fmt::Debug for GcpCredentialVendor {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("GcpCredentialVendor")
+            .field("config", &self.config)
+            .field("credential", &"[credential]")
+            .finish()
+    }
+}
+
+impl GcpCredentialVendor {
+    /// Create a new GCP credential vendor with the specified configuration.
+    ///
+    /// Uses [Application Default Credentials (ADC)][adc] for authentication.
+    /// To use a service account key file, set the `GOOGLE_APPLICATION_CREDENTIALS`
+    /// environment variable to the file path before starting the application.
+    ///
+    /// [adc]: https://cloud.google.com/docs/authentication/application-default-credentials
+    pub async fn new(config: GcpCredentialVendorConfig) -> Result<Self> {
+        let credential = credentials::create_access_token_credential()
+            .await
+            .map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to create GCP credentials: {}",
+                    e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        Ok(Self {
+            config,
+            http_client: Client::new(),
+            credential,
+        })
+    }
+
+    /// Parse a GCS URI to extract bucket and prefix.
+    fn parse_gcs_uri(uri: &str) -> Result<(String, String)> {
+        let url = uri_to_url(uri)?;
+
+        if url.scheme() != "gs" {
+            return Err(Error::InvalidInput {
+                source: format!(
+                    "Unsupported GCS URI scheme '{}', expected 'gs'",
+                    url.scheme()
+                )
+                .into(),
+                location: snafu::location!(),
+            });
+        }
+
+        let bucket = url
+            .host_str()
+            .ok_or_else(|| Error::InvalidInput {
+                source: format!("GCS URI '{}' missing bucket", uri).into(),
+                location: snafu::location!(),
+            })?
+            .to_string();
+
+        let prefix = url.path().trim_start_matches('/').to_string();
+
+        Ok((bucket, prefix))
+    }
+
+    /// Get a source token for downscoping.
+    ///
+    /// If service_account is configured, impersonates that service account
+    /// using the IAM Credentials API. Otherwise, uses the configured credential
+    /// directly.
+    async fn get_source_token(&self) -> Result<String> {
+        let base_token = self.credential.get_token().await.map_err(|e| Error::IO {
+            source: Box::new(std::io::Error::other(format!(
+                "Failed to get GCP token: {}",
+                e
+            ))),
+            location: snafu::location!(),
+        })?;
+
+        // If service account impersonation is configured, use generateAccessToken API
+        if let Some(ref service_account) = self.config.service_account {
+            return self
+                .impersonate_service_account(&base_token.token, service_account)
+                .await;
+        }
+
+        Ok(base_token.token)
+    }
+
+    /// Impersonate a service account using the IAM Credentials API.
+    ///
+    /// Uses the base token to call generateAccessToken for the target service account.
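+    ///
+    /// A sketch of the request this sends (endpoint and body are taken from the
+    /// implementation below):
+    ///
+    /// ```ignore
+    /// // POST https://iamcredentials.googleapis.com/v1/projects/-/serviceAccounts/{sa}:generateAccessToken
+    /// { "scope": ["https://www.googleapis.com/auth/cloud-platform"] }
+    /// ```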
+    async fn impersonate_service_account(
+        &self,
+        base_token: &str,
+        service_account: &str,
+    ) -> Result<String> {
+        let url = format!(
+            "https://iamcredentials.googleapis.com/v1/projects/-/serviceAccounts/{}:generateAccessToken",
+            service_account
+        );
+
+        // Request body with cloud-platform scope (required for GCS access)
+        let body = serde_json::json!({
+            "scope": ["https://www.googleapis.com/auth/cloud-platform"]
+        });
+
+        let response = self
+            .http_client
+            .post(&url)
+            .bearer_auth(base_token)
+            .json(&body)
+            .send()
+            .await
+            .map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to call IAM generateAccessToken: {}",
+                    e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        if !response.status().is_success() {
+            let status = response.status();
+            let body = response
+                .text()
+                .await
+                .unwrap_or_else(|_| "unknown error".to_string());
+            return Err(Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "IAM generateAccessToken failed for '{}' with status {}: {}",
+                    service_account, status, body
+                ))),
+                location: snafu::location!(),
+            });
+        }
+
+        let token_response: GenerateAccessTokenResponse =
+            response.json().await.map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to parse generateAccessToken response: {}",
+                    e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        Ok(token_response.access_token)
+    }
+
+    /// Build Credential Access Boundary for the specified bucket/prefix and permission.
+    fn build_access_boundary(
+        bucket: &str,
+        prefix: &str,
+        permission: VendedPermission,
+    ) -> CredentialAccessBoundary {
+        let bucket_resource = format!("//storage.googleapis.com/projects/_/buckets/{}", bucket);
+
+        let mut rules = vec![];
+
+        // Build condition expression for path restriction
+        let condition = if prefix.is_empty() {
+            None
+        } else {
+            let prefix_trimmed = prefix.trim_end_matches('/');
+            // CEL expression to restrict access to the specific path prefix.
+            // We append '/' to ensure exact prefix matching - without it, prefix "data"
+            // would incorrectly match "data-other/file.txt".
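+            //
+            // A sketch of the resulting CEL (illustrative bucket "b" and prefix "data"):
+            //   resource.name.startsWith('projects/_/buckets/b/objects/data/') ||
+            //   api.getAttribute('storage.googleapis.com/objectListPrefix', '') == 'data' ||
+            //   api.getAttribute('storage.googleapis.com/objectListPrefix', '').startsWith('data/')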
+            //
+            // For object access: resource.name must start with "prefix/"
+            // For list operations: listPrefix must equal "prefix" OR start with "prefix/"
+            let list_prefix_attr =
+                "api.getAttribute('storage.googleapis.com/objectListPrefix', '')";
+            let expr = format!(
+                "resource.name.startsWith('projects/_/buckets/{}/objects/{}/') || \
+                 {list_attr} == '{prefix}' || {list_attr}.startsWith('{prefix}/')",
+                bucket,
+                prefix_trimmed,
+                list_attr = list_prefix_attr,
+                prefix = prefix_trimmed
+            );
+            Some(AvailabilityCondition { expression: expr })
+        };
+
+        // Read permissions: legacyObjectReader for read + objectViewer for list
+        // Using legacy roles because modern roles lack storage.buckets.get
+        rules.push(AccessBoundaryRule {
+            available_resource: bucket_resource.clone(),
+            available_permissions: vec![
+                "inRole:roles/storage.legacyObjectReader".to_string(),
+                "inRole:roles/storage.objectViewer".to_string(),
+            ],
+            availability_condition: condition.clone(),
+        });
+
+        // Write permission: legacyBucketWriter + objectCreator for create/update
+        if permission.can_write() {
+            rules.push(AccessBoundaryRule {
+                available_resource: bucket_resource.clone(),
+                available_permissions: vec![
+                    "inRole:roles/storage.legacyBucketWriter".to_string(),
+                    "inRole:roles/storage.objectCreator".to_string(),
+                ],
+                availability_condition: condition.clone(),
+            });
+        }
+
+        // Admin permission: objectAdmin for delete
+        if permission.can_delete() {
+            rules.push(AccessBoundaryRule {
+                available_resource: bucket_resource,
+                available_permissions: vec!["inRole:roles/storage.objectAdmin".to_string()],
+                availability_condition: condition,
+            });
+        }
+
+        CredentialAccessBoundary {
+            access_boundary: AccessBoundaryInner {
+                access_boundary_rules: rules,
+            },
+        }
+    }
+
+    /// Exchange source token for a downscoped token using STS.
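+    ///
+    /// A sketch of the form-encoded parameters the implementation below sends to
+    /// the STS endpoint:
+    ///
+    /// ```ignore
+    /// grant_type=urn:ietf:params:oauth:grant-type:token-exchange
+    /// subject_token_type=urn:ietf:params:oauth:token-type:access_token
+    /// requested_token_type=urn:ietf:params:oauth:token-type:access_token
+    /// subject_token=<source access token>
+    /// options=<JSON-serialized CredentialAccessBoundary>
+    /// ```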
+    async fn downscope_token(
+        &self,
+        source_token: &str,
+        access_boundary: &CredentialAccessBoundary,
+    ) -> Result<(String, u64)> {
+        let options_json = serde_json::to_string(access_boundary).map_err(|e| Error::IO {
+            source: Box::new(std::io::Error::other(format!(
+                "Failed to serialize access boundary: {}",
+                e
+            ))),
+            location: snafu::location!(),
+        })?;
+
+        let params = [
+            (
+                "grant_type",
+                "urn:ietf:params:oauth:grant-type:token-exchange",
+            ),
+            (
+                "subject_token_type",
+                "urn:ietf:params:oauth:token-type:access_token",
+            ),
+            (
+                "requested_token_type",
+                "urn:ietf:params:oauth:token-type:access_token",
+            ),
+            ("subject_token", source_token),
+            ("options", &options_json),
+        ];
+
+        let response = self
+            .http_client
+            .post(STS_TOKEN_EXCHANGE_URL)
+            .form(&params)
+            .send()
+            .await
+            .map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to call STS token exchange: {}",
+                    e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        if !response.status().is_success() {
+            let status = response.status();
+            let body = response
+                .text()
+                .await
+                .unwrap_or_else(|_| "unknown error".to_string());
+            return Err(Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "STS token exchange failed with status {}: {}",
+                    status, body
+                ))),
+                location: snafu::location!(),
+            });
+        }
+
+        let token_response: TokenExchangeResponse =
+            response.json().await.map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to parse STS response: {}",
+                    e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        // Calculate expiration time
+        // Use expires_in from response if available, otherwise default to 1 hour
+        let expires_in_secs = token_response.expires_in.unwrap_or(3600);
+        let expires_at_millis = std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .expect("time went backwards")
+            .as_millis() as u64
+            + expires_in_secs * 1000;
+
+        Ok((token_response.access_token, expires_at_millis))
+    }
+
+    /// Hash an API key using SHA-256 with salt (Polaris pattern).
+    /// Format: SHA256(api_key + ":" + salt) as hex string.
+    pub fn hash_api_key(api_key: &str, salt: &str) -> String {
+        let mut hasher = Sha256::new();
+        hasher.update(format!("{}:{}", api_key, salt));
+        format!("{:x}", hasher.finalize())
+    }
+
+    /// Extract a session name from a JWT token (best effort, no validation).
+    /// Decodes the payload and extracts the 'sub' or 'email' claim.
+    /// Falls back to "lance-gcp-identity" if parsing fails.
+    fn derive_session_name_from_token(token: &str) -> String {
+        // JWT format: header.payload.signature
+        let parts: Vec<&str> = token.split('.').collect();
+        if parts.len() != 3 {
+            return "lance-gcp-identity".to_string();
+        }
+
+        // Decode the payload (second part)
+        let payload = match URL_SAFE_NO_PAD.decode(parts[1]) {
+            Ok(bytes) => bytes,
+            Err(_) => {
+                // Try standard base64 as fallback
+                match base64::engine::general_purpose::STANDARD_NO_PAD.decode(parts[1]) {
+                    Ok(bytes) => bytes,
+                    Err(_) => return "lance-gcp-identity".to_string(),
+                }
+            }
+        };
+
+        // Parse as JSON and extract 'sub' or 'email'
+        let json: serde_json::Value = match serde_json::from_slice(&payload) {
+            Ok(v) => v,
+            Err(_) => return "lance-gcp-identity".to_string(),
+        };
+
+        let subject = json
+            .get("sub")
+            .or_else(|| json.get("email"))
+            .and_then(|v| v.as_str())
+            .unwrap_or("unknown");
+
+        // Sanitize: keep only alphanumeric, '@', '-', and '.'
+        let sanitized: String = subject
+            .chars()
+            .filter(|c| c.is_alphanumeric() || *c == '@' || *c == '-' || *c == '.')
+            .collect();
+
+        format!("lance-{}", sanitized)
+    }
+
+    /// Normalize the Workload Identity Provider to the full audience format expected by GCP STS.
+    ///
+    /// GCP STS expects audience in the format:
+    /// `//iam.googleapis.com/projects/{project}/locations/global/workloadIdentityPools/{pool}/providers/{provider}`
+    ///
+    /// This function accepts either:
+    /// - Full format: `//iam.googleapis.com/projects/...`
+    /// - Short format: `projects/...` (will be prefixed with `//iam.googleapis.com/`)
+    fn normalize_workload_identity_audience(provider: &str) -> String {
+        const IAM_PREFIX: &str = "//iam.googleapis.com/";
+        if provider.starts_with(IAM_PREFIX) {
+            provider.to_string()
+        } else {
+            format!("{}{}", IAM_PREFIX, provider)
+        }
+    }
+
+    /// Exchange an OIDC token for GCP access token using Workload Identity Federation.
+    ///
+    /// This requires:
+    /// 1. A Workload Identity Pool and Provider configured in GCP
+    /// 2. The OIDC token's issuer to match the provider's configuration
+    /// 3. Optionally, a service account to impersonate after token exchange
+    async fn exchange_oidc_for_gcp_token(&self, oidc_token: &str) -> Result<String> {
+        let workload_identity_provider = self
+            .config
+            .workload_identity_provider
+            .as_ref()
+            .ok_or_else(|| Error::InvalidInput {
+                source: "gcp_workload_identity_provider must be configured for OIDC token exchange"
+                    .into(),
+                location: snafu::location!(),
+            })?;
+
+        // Normalize audience to full format expected by GCP STS
+        let audience = Self::normalize_workload_identity_audience(workload_identity_provider);
+
+        // Exchange OIDC token for GCP federated token via STS
+        let params = [
+            (
+                "grant_type",
+                "urn:ietf:params:oauth:grant-type:token-exchange",
+            ),
+            ("subject_token_type", "urn:ietf:params:oauth:token-type:jwt"),
+            (
+                "requested_token_type",
+                "urn:ietf:params:oauth:token-type:access_token",
+            ),
+            ("subject_token", oidc_token),
+            ("audience", audience.as_str()),
+            ("scope", "https://www.googleapis.com/auth/cloud-platform"),
+        ];
+
+        let response = self
+            .http_client
+            .post(STS_TOKEN_EXCHANGE_URL)
+            .form(&params)
+            .send()
+            .await
+            .map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to exchange OIDC token for GCP token: {}",
+                    e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        if !response.status().is_success() {
+            let status = response.status();
+            let body = response.text().await.unwrap_or_default();
+            return Err(Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "GCP STS token exchange failed with status {}: {}",
+                    status, body
+                ))),
+                location: snafu::location!(),
+            });
+        }
+
+        let token_response: TokenExchangeResponse =
+            response.json().await.map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to parse GCP STS token response: {}",
+                    e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        let federated_token = token_response.access_token;
+
+        // If impersonation is configured, use the federated token to get an impersonated token
+        if let Some(ref service_account) = self.config.impersonation_service_account {
+            return self
+                .impersonate_service_account(&federated_token, service_account)
+                .await;
+        }
+
+        Ok(federated_token)
+    }
+
+    /// Vend credentials using Workload Identity Federation (for auth_token).
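+    ///
+    /// A sketch of how a caller reaches this path (the JWT value is hypothetical):
+    ///
+    /// ```ignore
+    /// let identity = Identity { api_key: None, auth_token: Some(oidc_jwt) };
+    /// let creds = vendor.vend_credentials("gs://bucket/table", Some(&identity)).await?;
+    /// ```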
+    async fn vend_with_web_identity(
+        &self,
+        bucket: &str,
+        prefix: &str,
+        auth_token: &str,
+    ) -> Result<VendedCredentials> {
+        let session_name = Self::derive_session_name_from_token(auth_token);
+        debug!(
+            "GCP vend_with_web_identity: bucket={}, prefix={}, session={}",
+            bucket, prefix, session_name
+        );
+
+        // Exchange OIDC token for GCP token
+        let gcp_token = self.exchange_oidc_for_gcp_token(auth_token).await?;
+
+        // Build access boundary and downscope
+        let access_boundary = Self::build_access_boundary(bucket, prefix, self.config.permission);
+        let (downscoped_token, expires_at_millis) =
+            self.downscope_token(&gcp_token, &access_boundary).await?;
+
+        let mut storage_options = HashMap::new();
+        storage_options.insert("google_storage_token".to_string(), downscoped_token.clone());
+        storage_options.insert(
+            "expires_at_millis".to_string(),
+            expires_at_millis.to_string(),
+        );
+
+        info!(
+            "GCP credentials vended (web identity): bucket={}, prefix={}, permission={}, expires_at={}, token={}",
+            bucket, prefix, self.config.permission, expires_at_millis, redact_credential(&downscoped_token)
+        );
+
+        Ok(VendedCredentials::new(storage_options, expires_at_millis))
+    }
+
+    /// Vend credentials using API key validation.
+    async fn vend_with_api_key(
+        &self,
+        bucket: &str,
+        prefix: &str,
+        api_key: &str,
+    ) -> Result<VendedCredentials> {
+        let salt = self
+            .config
+            .api_key_salt
+            .as_ref()
+            .ok_or_else(|| Error::InvalidInput {
+                source: "api_key_salt must be configured to use API key authentication".into(),
+                location: snafu::location!(),
+            })?;
+
+        let key_hash = Self::hash_api_key(api_key, salt);
+
+        // Look up permission from hash mapping
+        let permission = self
+            .config
+            .api_key_hash_permissions
+            .get(&key_hash)
+            .copied()
+            .ok_or_else(|| {
+                warn!(
+                    "Invalid API key: hash {} not found in permissions map",
+                    &key_hash[..8]
+                );
+                Error::InvalidInput {
+                    source: "Invalid API key".into(),
+                    location: snafu::location!(),
+                }
+            })?;
+
+        debug!(
+            "GCP vend_with_api_key: bucket={}, prefix={}, permission={}",
+            bucket, prefix, permission
+        );
+
+        // Get source token using ADC and downscope with the API key's permission
+        let source_token = self.get_source_token().await?;
+        let access_boundary = Self::build_access_boundary(bucket, prefix, permission);
+        let (downscoped_token, expires_at_millis) = self
+            .downscope_token(&source_token, &access_boundary)
+            .await?;
+
+        let mut storage_options = HashMap::new();
+        storage_options.insert("google_storage_token".to_string(), downscoped_token.clone());
+        storage_options.insert(
+            "expires_at_millis".to_string(),
+            expires_at_millis.to_string(),
+        );
+
+        info!(
+            "GCP credentials vended (api_key): bucket={}, prefix={}, permission={}, expires_at={}, token={}",
+            bucket, prefix, permission, expires_at_millis, redact_credential(&downscoped_token)
+        );
+
+        Ok(VendedCredentials::new(storage_options, expires_at_millis))
+    }
+}
+
+#[async_trait]
+impl CredentialVendor for GcpCredentialVendor {
+    async fn vend_credentials(
+        &self,
+        table_location: &str,
+        identity: Option<&Identity>,
+    ) -> Result<VendedCredentials> {
+        debug!(
+            "GCP credential vending: location={}, permission={}, identity={:?}",
+            table_location,
+            self.config.permission,
+            identity.map(|i| format!(
+                "api_key={}, auth_token={}",
+                i.api_key.is_some(),
+                i.auth_token.is_some()
+            ))
+        );
+
+        let (bucket, prefix) = Self::parse_gcs_uri(table_location)?;
+
+        // Dispatch based on identity
+        match identity {
+            Some(id) if id.auth_token.is_some() => {
+                let auth_token = id.auth_token.as_ref().unwrap();
+                self.vend_with_web_identity(&bucket, &prefix, auth_token)
+                    .await
+            }
+            Some(id) if id.api_key.is_some() => {
+                let api_key = id.api_key.as_ref().unwrap();
+                self.vend_with_api_key(&bucket, &prefix, api_key).await
+            }
+            Some(_) => Err(Error::InvalidInput {
+                source: "Identity provided but neither auth_token nor api_key is set".into(),
+                location: snafu::location!(),
+            }),
+            None => {
+                // Static credential vending using ADC
+                let source_token = self.get_source_token().await?;
+                let access_boundary =
+                    Self::build_access_boundary(&bucket, &prefix, self.config.permission);
+                let (downscoped_token, expires_at_millis) = self
+                    .downscope_token(&source_token, &access_boundary)
+                    .await?;
+
+                let mut storage_options = HashMap::new();
+                storage_options
+                    .insert("google_storage_token".to_string(), downscoped_token.clone());
+                storage_options.insert(
+                    "expires_at_millis".to_string(),
+                    expires_at_millis.to_string(),
+                );
+
+                info!(
+                    "GCP credentials vended (static): bucket={}, prefix={}, permission={}, expires_at={}, token={}",
+                    bucket, prefix, self.config.permission, expires_at_millis, redact_credential(&downscoped_token)
+                );
+
+                Ok(VendedCredentials::new(storage_options, expires_at_millis))
+            }
+        }
+    }
+
+    fn provider_name(&self) -> &'static str {
+        "gcp"
+    }
+
+    fn permission(&self) -> VendedPermission {
+        self.config.permission
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_gcs_uri() {
+        let (bucket, prefix) = GcpCredentialVendor::parse_gcs_uri("gs://my-bucket/path/to/table")
+            .expect("should parse");
+        assert_eq!(bucket, "my-bucket");
+        assert_eq!(prefix, "path/to/table");
+
+        let (bucket, prefix) =
+            GcpCredentialVendor::parse_gcs_uri("gs://my-bucket/").expect("should parse");
+        assert_eq!(bucket, "my-bucket");
+        assert_eq!(prefix, "");
+
+        let (bucket, prefix) =
+            GcpCredentialVendor::parse_gcs_uri("gs://my-bucket").expect("should parse");
+        assert_eq!(bucket, "my-bucket");
+        assert_eq!(prefix, "");
+    }
+
+    #[test]
+    fn test_parse_gcs_uri_invalid() {
+        // Wrong scheme - should fail
+        let result = GcpCredentialVendor::parse_gcs_uri("s3://bucket/path");
+        assert!(result.is_err());
+
+        // Missing bucket
+        let result = GcpCredentialVendor::parse_gcs_uri("gs:///path");
+        assert!(result.is_err());
+
+        // Invalid URI format
+        let result = GcpCredentialVendor::parse_gcs_uri("not-a-uri");
+        assert!(result.is_err());
+
+        // Empty string
+        let result = GcpCredentialVendor::parse_gcs_uri("");
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_config_builder() {
+        let config = GcpCredentialVendorConfig::new()
+            .with_service_account("my-sa@project.iam.gserviceaccount.com")
+            .with_permission(VendedPermission::Write);
+
+        assert_eq!(
+            config.service_account,
+            Some("my-sa@project.iam.gserviceaccount.com".to_string())
+        );
+        assert_eq!(config.permission, VendedPermission::Write);
+    }
+
+    #[test]
+    fn test_build_access_boundary_read() {
+        let boundary = GcpCredentialVendor::build_access_boundary(
+            "my-bucket",
+            "path/to/data",
+            VendedPermission::Read,
+        );
+
+        let rules = &boundary.access_boundary.access_boundary_rules;
+        assert_eq!(rules.len(), 1, "Read should have 1 rule");
+
+        let permissions = &rules[0].available_permissions;
+        assert!(permissions.contains(&"inRole:roles/storage.legacyObjectReader".to_string()));
+        assert!(permissions.contains(&"inRole:roles/storage.objectViewer".to_string()));
+        assert!(rules[0].availability_condition.is_some());
+    }
+
+    #[test]
+    fn test_build_access_boundary_write() {
+        let boundary = GcpCredentialVendor::build_access_boundary(
+            "my-bucket",
+            "path/to/data",
+            VendedPermission::Write,
+        );
+
+        let rules = &boundary.access_boundary.access_boundary_rules;
+        assert_eq!(rules.len(), 2, "Write should have 2 rules");
+
+        let permissions: Vec<_> = rules
+            .iter()
+            .flat_map(|r| r.available_permissions.iter())
+            .collect();
+        assert!(permissions.contains(&&"inRole:roles/storage.legacyObjectReader".to_string()));
+        assert!(permissions.contains(&&"inRole:roles/storage.objectViewer".to_string()));
+        assert!(permissions.contains(&&"inRole:roles/storage.legacyBucketWriter".to_string()));
+        assert!(permissions.contains(&&"inRole:roles/storage.objectCreator".to_string()));
+    }
+
+    #[test]
+    fn test_build_access_boundary_admin() {
+        let boundary = GcpCredentialVendor::build_access_boundary(
+            "my-bucket",
+            "path/to/data",
+            VendedPermission::Admin,
+        );
+
+        let rules = &boundary.access_boundary.access_boundary_rules;
+        assert_eq!(rules.len(), 3, "Admin should have 3 rules");
+
+        let permissions: Vec<_> = rules
+            .iter()
+            .flat_map(|r| r.available_permissions.iter())
+            .collect();
+        assert!(permissions.contains(&&"inRole:roles/storage.legacyObjectReader".to_string()));
+        assert!(permissions.contains(&&"inRole:roles/storage.objectViewer".to_string()));
+        assert!(permissions.contains(&&"inRole:roles/storage.legacyBucketWriter".to_string()));
+        assert!(permissions.contains(&&"inRole:roles/storage.objectCreator".to_string()));
+        assert!(permissions.contains(&&"inRole:roles/storage.objectAdmin".to_string()));
+    }
+
+    #[test]
+    fn test_build_access_boundary_no_prefix() {
+        let boundary =
+            GcpCredentialVendor::build_access_boundary("my-bucket", "", VendedPermission::Read);
+
+        let rules = &boundary.access_boundary.access_boundary_rules;
+        assert_eq!(rules.len(), 1);
+        // No condition when prefix is empty (full bucket access)
+        assert!(rules[0].availability_condition.is_none());
+    }
+
+    #[test]
+    fn test_normalize_workload_identity_audience() {
+        // Short format should be prefixed
+        let short =
+            "projects/123456/locations/global/workloadIdentityPools/my-pool/providers/my-provider";
+        let normalized = GcpCredentialVendor::normalize_workload_identity_audience(short);
+        assert_eq!(
+            normalized,
+            "//iam.googleapis.com/projects/123456/locations/global/workloadIdentityPools/my-pool/providers/my-provider"
+        );
+
+        // Full format should be unchanged
+        let full = "//iam.googleapis.com/projects/123456/locations/global/workloadIdentityPools/my-pool/providers/my-provider";
+        let normalized = GcpCredentialVendor::normalize_workload_identity_audience(full);
+        assert_eq!(normalized, full);
+
+        // Edge case: already has prefix (idempotent)
+        let normalized_again =
+            GcpCredentialVendor::normalize_workload_identity_audience(&normalized);
+        assert_eq!(normalized_again, full);
+    }
+}
diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs
index fd5a63a0848..875df33e580 100644
--- a/rust/lance-namespace-impls/src/dir.rs
+++ b/rust/lance-namespace-impls/src/dir.rs
@@ -16,23 +16,42 @@
 use lance::dataset::{Dataset, WriteParams};
 use lance::session::Session;
 use lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry};
 use object_store::path::Path;
+use object_store::{Error as ObjectStoreError, ObjectStore as OSObjectStore, PutMode, PutOptions};
 use std::collections::HashMap;
 use std::io::Cursor;
 use std::sync::Arc;
 
+use crate::context::DynamicContextProvider;
 use lance_namespace::models::{
     CreateEmptyTableRequest, CreateEmptyTableResponse,
    CreateNamespaceRequest,
-    CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, DescribeNamespaceRequest,
-    DescribeNamespaceResponse, DescribeTableRequest, DescribeTableResponse, DropNamespaceRequest,
-    DropNamespaceResponse, DropTableRequest, DropTableResponse, ListNamespacesRequest,
-    ListNamespacesResponse, ListTablesRequest, ListTablesResponse, NamespaceExistsRequest,
-    TableExistsRequest,
+    CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, DeclareTableRequest,
+    DeclareTableResponse, DescribeNamespaceRequest, DescribeNamespaceResponse,
+    DescribeTableRequest, DescribeTableResponse, DropNamespaceRequest, DropNamespaceResponse,
+    DropTableRequest, DropTableResponse, Identity, ListNamespacesRequest, ListNamespacesResponse,
+    ListTablesRequest, ListTablesResponse, NamespaceExistsRequest, TableExistsRequest,
 };
 use lance_core::{box_error, Error, Result};
 use lance_namespace::schema::arrow_schema_to_json;
 use lance_namespace::LanceNamespace;
 
+use crate::credentials::{
+    create_credential_vendor_for_location, has_credential_vendor_config, CredentialVendor,
+};
+
+/// Result of checking table status atomically.
+///
+/// This struct captures the state of a table directory in a single snapshot,
+/// avoiding race conditions between checking existence and other status flags.
+pub(crate) struct TableStatus {
+    /// Whether the table directory exists (has any files)
+    pub(crate) exists: bool,
+    /// Whether the table has a `.lance-deregistered` marker file
+    pub(crate) is_deregistered: bool,
+    /// Whether the table has a `.lance-reserved` marker file (declared but not written)
+    pub(crate) has_reserved_file: bool,
+}
+
 /// Builder for creating a DirectoryNamespace.
 ///
 /// This builder provides a fluent API for configuring and establishing
@@ -67,7 +86,7 @@ use lance_namespace::LanceNamespace;
 /// # Ok(())
 /// # }
 /// ```
-#[derive(Debug, Clone)]
+#[derive(Clone)]
 pub struct DirectoryNamespaceBuilder {
     root: String,
     storage_options: Option<HashMap<String, String>>,
@@ -75,6 +94,27 @@ pub struct DirectoryNamespaceBuilder {
     manifest_enabled: bool,
     dir_listing_enabled: bool,
     inline_optimization_enabled: bool,
+    credential_vendor_properties: HashMap<String, String>,
+    context_provider: Option<Arc<dyn DynamicContextProvider>>,
+}
+
+impl std::fmt::Debug for DirectoryNamespaceBuilder {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("DirectoryNamespaceBuilder")
+            .field("root", &self.root)
+            .field("storage_options", &self.storage_options)
+            .field("manifest_enabled", &self.manifest_enabled)
+            .field("dir_listing_enabled", &self.dir_listing_enabled)
+            .field(
+                "inline_optimization_enabled",
+                &self.inline_optimization_enabled,
+            )
+            .field(
+                "context_provider",
+                &self.context_provider.as_ref().map(|_| "Some(...)"),
+            )
+            .finish()
+    }
 }
 
 impl DirectoryNamespaceBuilder {
@@ -91,6 +131,8 @@ impl DirectoryNamespaceBuilder {
             manifest_enabled: true,
             dir_listing_enabled: true, // Default to enabled for backwards compatibility
             inline_optimization_enabled: true,
+            credential_vendor_properties: HashMap::new(),
+            context_provider: None,
         }
     }
@@ -132,6 +174,29 @@
     /// - `inline_optimization_enabled`: Enable inline optimization of __manifest table (optional, default: true)
     /// - `storage.*`: Storage options (optional, prefix will be stripped)
     ///
+    /// Credential vendor properties (prefixed with `credential_vendor.`, prefix is stripped):
+    /// - `credential_vendor.enabled`: Set to "true" to enable credential vending (required)
+    /// - `credential_vendor.permission`: Permission level: read, write, or admin (default: read)
+    ///
+    /// AWS-specific properties (for s3:// locations):
+    /// - `credential_vendor.aws_role_arn`: AWS IAM role ARN (required for AWS)
+    /// - `credential_vendor.aws_external_id`: AWS external ID (optional)
+    /// - `credential_vendor.aws_region`: AWS region (optional)
+    /// - `credential_vendor.aws_role_session_name`: AWS role session name (optional)
+    /// - `credential_vendor.aws_duration_millis`: Credential duration in ms (default: 3600000, range: 15min-12hrs)
+    ///
+    /// GCP-specific properties (for gs:// locations):
+    /// - `credential_vendor.gcp_service_account`: Service account to impersonate (optional)
+    ///
+    /// Note: GCP uses Application Default Credentials (ADC). To use a service account key file,
+    /// set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable before starting.
+    /// GCP token duration cannot be configured; it's determined by the STS endpoint (typically 1 hour).
+    ///
+    /// Azure-specific properties (for az:// locations):
+    /// - `credential_vendor.azure_account_name`: Azure storage account name (required for Azure)
+    /// - `credential_vendor.azure_tenant_id`: Azure tenant ID (optional)
+    /// - `credential_vendor.azure_duration_millis`: Credential duration in ms (default: 3600000, up to 7 days)
+    ///
     /// # Arguments
     ///
     /// * `properties` - Configuration properties
@@ -209,6 +274,17 @@
             .and_then(|v| v.parse::<bool>().ok())
             .unwrap_or(true);
 
+        // Extract credential vendor properties (properties prefixed with "credential_vendor.")
+        // The prefix is stripped to get short property names
+        // The build() method will check if enabled=true before creating the vendor
+        let credential_vendor_properties: HashMap<String, String> = properties
+            .iter()
+            .filter_map(|(k, v)| {
+                k.strip_prefix("credential_vendor.")
+                    .map(|key| (key.to_string(), v.clone()))
+            })
+            .collect();
+
         Ok(Self {
             root: root.trim_end_matches('/').to_string(),
             storage_options,
@@ -216,6 +292,8 @@
             manifest_enabled,
             dir_listing_enabled,
             inline_optimization_enabled,
+            credential_vendor_properties,
+            context_provider: None,
         })
     }
@@ -258,6 +336,69 @@
         self
     }
 
+    /// Add a credential vendor property.
+    ///
+    /// Use short property names without the `credential_vendor.` prefix.
+    /// Common properties: `enabled`, `permission`.
+    /// AWS properties: `aws_role_arn`, `aws_external_id`, `aws_region`, `aws_role_session_name`, `aws_duration_millis`.
+    /// GCP properties: `gcp_service_account`.
+    /// Azure properties: `azure_account_name`, `azure_tenant_id`, `azure_duration_millis`.
+    ///
+    /// # Arguments
+    ///
+    /// * `key` - Property key (e.g., "enabled", "aws_role_arn")
+    /// * `value` - Property value
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    /// # use lance_namespace_impls::DirectoryNamespaceBuilder;
+    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
+    /// let namespace = DirectoryNamespaceBuilder::new("s3://my-bucket/data")
+    ///     .credential_vendor_property("enabled", "true")
+    ///     .credential_vendor_property("aws_role_arn", "arn:aws:iam::123456789012:role/MyRole")
+    ///     .credential_vendor_property("permission", "read")
+    ///     .build()
+    ///     .await?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub fn credential_vendor_property(
+        mut self,
+        key: impl Into<String>,
+        value: impl Into<String>,
+    ) -> Self {
+        self.credential_vendor_properties
+            .insert(key.into(), value.into());
+        self
+    }
+
+    /// Add multiple credential vendor properties.
+    ///
+    /// Use short property names without the `credential_vendor.` prefix.
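+    ///
+    /// A sketch (the role ARN is illustrative, as in the example above):
+    ///
+    /// ```ignore
+    /// let mut props = HashMap::new();
+    /// props.insert("enabled".to_string(), "true".to_string());
+    /// props.insert("aws_role_arn".to_string(), "arn:aws:iam::123456789012:role/MyRole".to_string());
+    /// let builder = DirectoryNamespaceBuilder::new("s3://my-bucket/data")
+    ///     .credential_vendor_properties(props);
+    /// ```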
+    ///
+    /// # Arguments
+    ///
+    /// * `properties` - HashMap of credential vendor properties to add
+    pub fn credential_vendor_properties(mut self, properties: HashMap<String, String>) -> Self {
+        self.credential_vendor_properties.extend(properties);
+        self
+    }
+
+    /// Set a dynamic context provider for per-request context.
+    ///
+    /// The provider can be used to generate additional context for operations.
+    /// For DirectoryNamespace, the context is stored but not directly used
+    /// in operations (unlike RestNamespace where it's converted to HTTP headers).
+    ///
+    /// # Arguments
+    ///
+    /// * `provider` - The context provider implementation
+    pub fn context_provider(mut self, provider: Arc<dyn DynamicContextProvider>) -> Self {
+        self.context_provider = Some(provider);
+        self
+    }
+
     /// Build the DirectoryNamespace.
     ///
     /// # Returns
@@ -300,6 +441,16 @@
             None
         };
 
+        // Create credential vendor once during initialization if enabled
+        let credential_vendor = if has_credential_vendor_config(&self.credential_vendor_properties)
+        {
+            create_credential_vendor_for_location(&self.root, &self.credential_vendor_properties)
+                .await?
+                .map(Arc::from)
+        } else {
+            None
+        };
+
         Ok(DirectoryNamespace {
             root: self.root,
             storage_options: self.storage_options,
@@ -308,6 +459,8 @@
             base_path,
             manifest_ns,
             dir_listing_enabled: self.dir_listing_enabled,
+            credential_vendor,
+            context_provider: self.context_provider,
         })
     }
 
@@ -318,8 +471,11 @@
         session: &Option<Arc<Session>>,
     ) -> Result<(Arc<ObjectStore>, Path)> {
         // Build ObjectStoreParams from storage options
+        let accessor = storage_options.clone().map(|opts| {
+            Arc::new(lance_io::object_store::StorageOptionsAccessor::with_static_options(opts))
+        });
         let params = ObjectStoreParams {
-            storage_options: storage_options.clone(),
+            storage_options_accessor: accessor,
             ..Default::default()
         };
 
@@ -357,6 +513,14 @@
 ///
 /// When `dir_listing_enabled=true`, the namespace falls back to directory scanning for tables not
 /// found in the manifest, enabling gradual migration.
+///
+/// ## Credential Vending
+///
+/// When credential vendor properties are configured, `describe_table` will vend temporary
+/// credentials based on the table location URI. The vendor type is auto-selected:
+/// - `s3://` locations use AWS STS AssumeRole
+/// - `gs://` locations use GCP OAuth2 tokens
+/// - `az://` locations use Azure SAS tokens
 pub struct DirectoryNamespace {
     root: String,
     storage_options: Option<HashMap<String, String>>,
@@ -366,6 +530,13 @@
     base_path: Path,
     manifest_ns: Option<Arc<ManifestNamespace>>,
     dir_listing_enabled: bool,
+    /// Credential vendor created once during initialization.
+    /// Used to vend temporary credentials for table access.
+    credential_vendor: Option<Arc<dyn CredentialVendor>>,
+    /// Dynamic context provider for per-request context.
+    /// Stored but not directly used in operations (available for future extensions).
+    #[allow(dead_code)]
+    context_provider: Option<Arc<dyn DynamicContextProvider>>,
 }
 
 impl std::fmt::Debug for DirectoryNamespace {
@@ -435,6 +606,13 @@
         }
 
         let table_name = &path[..path.len() - 6];
+
+        // Use atomic check to skip deregistered tables and declared-but-not-written tables
+        let status = self.check_table_status(table_name).await;
+        if status.is_deregistered || status.has_reserved_file {
+            continue;
+        }
+
         tables.push(table_name.to_string());
     }
@@ -496,6 +674,102 @@
             .child(".lance-reserved")
     }
 
+    /// Get the deregistered marker file path for a table
+    fn table_deregistered_file_path(&self, table_name: &str) -> Path {
+        self.base_path
+            .child(format!("{}.lance", table_name).as_str())
+            .child(".lance-deregistered")
+    }
+
+    /// Atomically check table existence and deregistration status.
+    ///
+    /// This performs a single directory listing to get a consistent snapshot of the
+    /// table's state, avoiding race conditions between checking existence and
+    /// checking deregistration status.
+    pub(crate) async fn check_table_status(&self, table_name: &str) -> TableStatus {
+        let table_path = self.table_path(table_name);
+        match self.object_store.read_dir(table_path).await {
+            Ok(entries) => {
+                let exists = !entries.is_empty();
+                let is_deregistered = entries.iter().any(|e| e.ends_with(".lance-deregistered"));
+                let has_reserved_file = entries.iter().any(|e| e.ends_with(".lance-reserved"));
+                TableStatus {
+                    exists,
+                    is_deregistered,
+                    has_reserved_file,
+                }
+            }
+            Err(_) => TableStatus {
+                exists: false,
+                is_deregistered: false,
+                has_reserved_file: false,
+            },
+        }
+    }
+
+    /// Atomically create a marker file using put_if_not_exists semantics.
+    ///
+    /// This uses `PutMode::Create`, which will fail if the file already exists,
+    /// providing atomic creation semantics to avoid race conditions.
+    ///
+    /// Returns Ok(()) if the file was created successfully.
+    /// Returns Err with an appropriate message if the file already exists or on any other error.
+    async fn put_marker_file_atomic(
+        &self,
+        path: &Path,
+        file_description: &str,
+    ) -> std::result::Result<(), String> {
+        let put_opts = PutOptions {
+            mode: PutMode::Create,
+            ..Default::default()
+        };
+
+        match self
+            .object_store
+            .inner
+            .put_opts(path, bytes::Bytes::new().into(), put_opts)
+            .await
+        {
+            Ok(_) => Ok(()),
+            Err(ObjectStoreError::AlreadyExists { .. })
+            | Err(ObjectStoreError::Precondition { .. }) => {
+                Err(format!("{} already exists", file_description))
+            }
+            Err(e) => Err(format!("Failed to create {}: {}", file_description, e)),
+        }
+    }
+
+    /// Get storage options for a table, using credential vending if configured.
+    ///
+    /// If credential vendor properties are configured and the table location matches
+    /// a supported cloud provider, this will create an appropriate vendor and vend
+    /// temporary credentials scoped to the table location. Otherwise, returns the
+    /// static storage options.
+    ///
+    /// The vendor type is auto-selected based on the table URI:
+    /// - `s3://` locations use AWS STS AssumeRole
+    /// - `gs://` locations use GCP OAuth2 tokens
+    /// - `az://` locations use Azure SAS tokens
+    ///
+    /// The permission level (Read, Write, Admin) is configured at namespace
+    /// initialization time via the `credential_vendor.permission` property.
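+    ///
+    /// A sketch of the expected behavior (hypothetical caller):
+    ///
+    /// ```ignore
+    /// // With a vendor configured, this returns vended, scoped credentials;
+    /// // otherwise it falls back to the namespace's static storage options.
+    /// let opts = self
+    ///     .get_storage_options_for_table("s3://bucket/ns/table.lance", None)
+    ///     .await?;
+    /// ```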
+ /// + /// # Arguments + /// + /// * `table_uri` - The full URI of the table + /// * `identity` - Optional identity from the request for identity-based credential vending + async fn get_storage_options_for_table( + &self, + table_uri: &str, + identity: Option<&Identity>, + ) -> Result>> { + if let Some(ref vendor) = self.credential_vendor { + let vended = vendor.vend_credentials(table_uri, identity).await?; + return Ok(Some(vended.storage_options)); + } + Ok(self.storage_options.clone()) + } + /// Migrate directory-based tables to the manifest. /// /// This is a one-time migration operation that: @@ -601,8 +875,10 @@ impl LanceNamespace for DirectoryNamespace { } Self::validate_root_namespace_id(&request.id)?; + #[allow(clippy::needless_update)] Ok(DescribeNamespaceResponse { properties: Some(HashMap::new()), + ..Default::default() }) } @@ -735,7 +1011,20 @@ impl LanceNamespace for DirectoryNamespace { async fn describe_table(&self, request: DescribeTableRequest) -> Result { if let Some(ref manifest_ns) = self.manifest_ns { match manifest_ns.describe_table(request.clone()).await { - Ok(response) => return Ok(response), + Ok(mut response) => { + // Only apply identity-based credential vending when explicitly requested + if request.vend_credentials == Some(true) && self.credential_vendor.is_some() { + if let Some(ref table_uri) = response.table_uri { + let identity = request.identity.as_deref(); + response.storage_options = self + .get_storage_options_for_table(table_uri, identity) + .await?; + } + } else if request.vend_credentials == Some(false) { + response.storage_options = None; + } + return Ok(response); + } Err(_) if self.dir_listing_enabled && request.id.as_ref().is_some_and(|id| id.len() == 1) => @@ -749,21 +1038,52 @@ impl LanceNamespace for DirectoryNamespace { let table_name = Self::table_name_from_id(&request.id)?; let table_uri = self.table_full_uri(&table_name); - let table_path = self.table_path(&table_name); - let dir_exists = self - .object_store - .read_dir(table_path) - .await - .map(|entries| !entries.is_empty()) - .unwrap_or(false); + // Atomically check table existence and deregistration status + let status = self.check_table_status(&table_name).await; - if !dir_exists { + if !status.exists { return Err(Error::Namespace { source: format!("Table does not exist: {}", table_name).into(), location: snafu::location!(), }); } + if status.is_deregistered { + return Err(Error::Namespace { + source: format!("Table is deregistered: {}", table_name).into(), + location: snafu::location!(), + }); + } + + let load_detailed_metadata = request.load_detailed_metadata.unwrap_or(false); + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + let identity = request.identity.as_deref(); + + // If not loading detailed metadata, return minimal response with just location + if !load_detailed_metadata { + let storage_options = if vend_credentials { + self.get_storage_options_for_table(&table_uri, identity) + .await? 
+ } else { + None + }; + return Ok(DescribeTableResponse { + table: Some(table_name), + namespace: request.id.as_ref().map(|id| { + if id.len() > 1 { + id[..id.len() - 1].to_vec() + } else { + vec![] + } + }), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), + storage_options, + ..Default::default() + }); + } + // Try to load the dataset to get real information match Dataset::open(&table_uri).await { Ok(mut dataset) => { @@ -772,32 +1092,61 @@ impl LanceNamespace for DirectoryNamespace { dataset = dataset.checkout_version(requested_version as u64).await?; } - let version = dataset.version().version; + let version_info = dataset.version(); let lance_schema = dataset.schema(); let arrow_schema: arrow_schema::Schema = lance_schema.into(); let json_schema = arrow_schema_to_json(&arrow_schema)?; + let storage_options = if vend_credentials { + self.get_storage_options_for_table(&table_uri, identity) + .await? + } else { + None + }; + + // Convert BTreeMap to HashMap for the response + let metadata: std::collections::HashMap = + version_info.metadata.into_iter().collect(); + Ok(DescribeTableResponse { - version: Some(version as i64), - location: Some(table_uri), + table: Some(table_name), + namespace: request.id.as_ref().map(|id| { + if id.len() > 1 { + id[..id.len() - 1].to_vec() + } else { + vec![] + } + }), + version: Some(version_info.version as i64), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), schema: Some(Box::new(json_schema)), - properties: None, - storage_options: self.storage_options.clone(), + storage_options, + metadata: Some(metadata), + ..Default::default() }) } Err(err) => { - let reserved_file_path = self.table_reserved_file_path(&table_name); - if self - .object_store - .exists(&reserved_file_path) - .await - .unwrap_or(false) - { + // Use the reserved file status from the atomic check + if status.has_reserved_file { + let storage_options = if vend_credentials { + self.get_storage_options_for_table(&table_uri, identity) + .await? 
+ } else { + None + }; Ok(DescribeTableResponse { - version: None, - location: Some(table_uri), - schema: None, - properties: None, - storage_options: self.storage_options.clone(), + table: Some(table_name), + namespace: request.id.as_ref().map(|id| { + if id.len() > 1 { + id[..id.len() - 1].to_vec() + } else { + vec![] + } + }), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), + storage_options, + ..Default::default() }) } else { Err(Error::Namespace { @@ -825,21 +1174,24 @@ impl LanceNamespace for DirectoryNamespace { } let table_name = Self::table_name_from_id(&request.id)?; - let table_path = self.table_path(&table_name); - let table_exists = self - .object_store - .read_dir(table_path) - .await - .map(|entries| !entries.is_empty()) - .unwrap_or(false); - if !table_exists { + // Atomically check table existence and deregistration status + let status = self.check_table_status(&table_name).await; + + if !status.exists { return Err(Error::Namespace { source: format!("Table does not exist: {}", table_name).into(), location: snafu::location!(), }); } + if status.is_deregistered { + return Err(Error::Namespace { + source: format!("Table is deregistered: {}", table_name).into(), + location: snafu::location!(), + }); + } + Ok(()) } @@ -863,8 +1215,7 @@ impl LanceNamespace for DirectoryNamespace { Ok(DropTableResponse { id: request.id, location: Some(table_uri), - properties: None, - transaction_id: None, + ..Default::default() }) } @@ -886,21 +1237,6 @@ impl LanceNamespace for DirectoryNamespace { }); } - // Validate location if provided - if let Some(location) = &request.location { - let location = location.trim_end_matches('/'); - if location != table_uri { - return Err(Error::Namespace { - source: format!( - "Cannot create table {} at location {}, must be at location {}", - table_name, location, table_uri - ) - .into(), - location: snafu::location!(), - }); - } - } - // Parse the Arrow IPC stream from request_data let cursor = Cursor::new(request_data.to_vec()); let stream_reader = StreamReader::try_new(cursor, None).map_err(|e| Error::Namespace { @@ -929,7 +1265,9 @@ impl LanceNamespace for DirectoryNamespace { }; let store_params = self.storage_options.as_ref().map(|opts| ObjectStoreParams { - storage_options: Some(opts.clone()), + storage_options_accessor: Some(Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options(opts.clone()), + )), ..Default::default() }); @@ -950,8 +1288,8 @@ impl LanceNamespace for DirectoryNamespace { Ok(CreateTableResponse { version: Some(1), location: Some(table_uri), - properties: None, storage_options: self.storage_options.clone(), + ..Default::default() }) } @@ -960,7 +1298,20 @@ impl LanceNamespace for DirectoryNamespace { async fn create_empty_table( &self, request: CreateEmptyTableRequest, ) -> Result<CreateEmptyTableResponse> { if let Some(ref manifest_ns) = self.manifest_ns { - return manifest_ns.create_empty_table(request).await; + #[allow(deprecated)] + let mut response = manifest_ns.create_empty_table(request.clone()).await?; + // Only apply identity-based credential vending when explicitly requested + if request.vend_credentials == Some(true) && self.credential_vendor.is_some() { + if let Some(ref location) = response.location { + let identity = request.identity.as_deref(); + response.storage_options = self + .get_storage_options_for_table(location, identity) + .await?; + } + } else if request.vend_credentials == Some(false) { + response.storage_options = None; + } + return Ok(response); } let table_name = Self::table_name_from_id(&request.id)?; @@ -981,35 +1332,107 @@
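The hunks above replace the old ad-hoc `read_dir`/`exists` probes with a single `check_table_status` call, whose definition falls outside this diff. A minimal sketch of the shape implied by the call sites and by `test_atomic_table_status_check` further down; deriving all three flags from one listing is an assumption, as are the names used here:

/// Snapshot of a table directory's state, taken from one listing so the three
/// flags are mutually consistent (no TOCTOU window between separate checks).
pub(crate) struct TableStatus {
    pub exists: bool,            // directory has any entries (data or markers)
    pub is_deregistered: bool,   // ".lance-deregistered" marker present
    pub has_reserved_file: bool, // ".lance-reserved" marker present
}

/// Hypothetical derivation; `entries` are assumed to be the file names returned
/// by a single `read_dir` of the table directory.
pub(crate) fn table_status_from_entries(entries: &[String]) -> TableStatus {
    TableStatus {
        exists: !entries.is_empty(),
        is_deregistered: entries.iter().any(|e| e == ".lance-deregistered"),
        has_reserved_file: entries.iter().any(|e| e == ".lance-reserved"),
    }
}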
impl LanceNamespace for DirectoryNamespace { } } - // Create the .lance-reserved file to mark the table as existing + // Atomically create the .lance-reserved file to mark the table as existing. + // This uses put_if_not_exists semantics to avoid race conditions. let reserved_file_path = self.table_reserved_file_path(&table_name); - self.object_store - .create(&reserved_file_path) + self.put_marker_file_atomic(&reserved_file_path, &format!("table {}", table_name)) .await .map_err(|e| Error::Namespace { - source: format!( - "Failed to create .lance-reserved file for table {}: {}", - table_name, e - ) - .into(), + source: e.into(), + location: snafu::location!(), + })?; + + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + let identity = request.identity.as_deref(); + let storage_options = if vend_credentials { + self.get_storage_options_for_table(&table_uri, identity) + .await? + } else { + None + }; + + Ok(CreateEmptyTableResponse { + location: Some(table_uri), + storage_options, + ..Default::default() + }) + } + + async fn declare_table(&self, request: DeclareTableRequest) -> Result<DeclareTableResponse> { + if let Some(ref manifest_ns) = self.manifest_ns { + let mut response = manifest_ns.declare_table(request.clone()).await?; + // Only apply identity-based credential vending when explicitly requested + if request.vend_credentials == Some(true) && self.credential_vendor.is_some() { + if let Some(ref location) = response.location { + let identity = request.identity.as_deref(); + response.storage_options = self + .get_storage_options_for_table(location, identity) + .await?; + } + } else if request.vend_credentials == Some(false) { + response.storage_options = None; + } + return Ok(response); + } + + let table_name = Self::table_name_from_id(&request.id)?; + let table_uri = self.table_full_uri(&table_name); + + // Validate location if provided + if let Some(location) = &request.location { + let location = location.trim_end_matches('/'); + if location != table_uri { + return Err(Error::Namespace { + source: format!( + "Cannot declare table {} at location {}, must be at location {}", + table_name, location, table_uri + ) + .into(), + location: snafu::location!(), + }); + } + } + + // Check if table already has data (created via create_table). + // The atomic put only prevents races between concurrent declare_table calls, + // not between declare_table and existing data. + let status = self.check_table_status(&table_name).await; + if status.exists && !status.has_reserved_file { + // Table has data but no reserved file - it was created with data + return Err(Error::Namespace { + source: format!("Table already exists: {}", table_name).into(), location: snafu::location!(), - })? - .shutdown() + }); + } + + // Atomically create the .lance-reserved file to mark the table as declared. + // This uses put_if_not_exists semantics to avoid race conditions between + // concurrent declare_table calls.
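`put_marker_file_atomic` is likewise defined outside this diff. With the upstream `object_store` crate, put-if-absent is typically spelled `PutMode::Create`, which surfaces a distinct `AlreadyExists` error when another writer wins the race. A sketch under that assumption; the helper name, error strings, and `String` error type are illustrative, and `lance_io`'s wrapper may differ:

use object_store::{path::Path, Error as StoreError, ObjectStore, PutMode, PutPayload};

/// Create an empty marker object only if nothing exists at `path` yet.
/// Returns a plain String error so callers can match on "already exists".
async fn put_marker_file_atomic(
    store: &dyn ObjectStore,
    path: &Path,
    what: &str,
) -> Result<(), String> {
    match store
        .put_opts(path, PutPayload::from_static(b""), PutMode::Create.into())
        .await
    {
        Ok(_) => Ok(()),
        // Another writer created the marker first: report the lost race distinctly.
        Err(StoreError::AlreadyExists { .. }) => {
            Err(format!("marker file for {} already exists", what))
        }
        Err(e) => Err(format!("failed to create marker file for {}: {}", what, e)),
    }
}

The error mapping in `deregister_table` below relies on exactly this distinction to turn a lost race into "Table is already deregistered".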
+ let reserved_file_path = self.table_reserved_file_path(&table_name); + + self.put_marker_file_atomic(&reserved_file_path, &format!("table {}", table_name)) .await .map_err(|e| Error::Namespace { - source: format!( - "Failed to finalize .lance-reserved file for table {}: {}", - table_name, e - ) - .into(), + source: e.into(), location: snafu::location!(), })?; - Ok(CreateEmptyTableResponse { + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + let identity = request.identity.as_deref(); + let storage_options = if vend_credentials { + self.get_storage_options_for_table(&table_uri, identity) + .await? + } else { + None + }; + + Ok(DeclareTableResponse { location: Some(table_uri), - properties: None, - storage_options: self.storage_options.clone(), + storage_options, + ..Default::default() }) } @@ -1038,10 +1461,56 @@ impl LanceNamespace for DirectoryNamespace { return LanceNamespace::deregister_table(manifest_ns.as_ref(), request).await; } - // Without manifest, deregister_table is not supported - Err(Error::NotSupported { - source: "deregister_table is only supported when manifest mode is enabled".into(), - location: snafu::location!(), + // V1 mode: create a .lance-deregistered marker file in the table directory + let table_name = Self::table_name_from_id(&request.id)?; + let table_uri = self.table_full_uri(&table_name); + + // Check table existence and deregistration status. + // This provides better error messages for common cases. + let status = self.check_table_status(&table_name).await; + + if !status.exists { + return Err(Error::Namespace { + source: format!("Table does not exist: {}", table_name).into(), + location: snafu::location!(), + }); + } + + if status.is_deregistered { + return Err(Error::Namespace { + source: format!("Table is already deregistered: {}", table_name).into(), + location: snafu::location!(), + }); + } + + // Atomically create the .lance-deregistered marker file. + // This uses put_if_not_exists semantics to prevent race conditions + // when multiple processes try to deregister the same table concurrently. + // If a race occurs and another process already created the file, + // we'll get an AlreadyExists error which we convert to a proper message. 
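Note the deliberate asymmetry in how `vend_credentials: Option<bool>` is interpreted throughout these hunks: the delegated (manifest) paths only act on an explicit `Some(true)` or `Some(false)`, while the local paths default to vending via `unwrap_or(true)`. Condensed into a sketch (the function and enum names are illustrative, not from the diff):

/// Local (non-delegated) path: unset means "vend", matching the legacy default.
fn local_path_should_vend(flag: Option<bool>) -> bool {
    flag.unwrap_or(true)
}

/// Delegated path: only explicit values alter the inner namespace's response.
enum DelegatedAction {
    Vend,        // Some(true): re-vend identity-scoped credentials
    Strip,       // Some(false): clear storage_options from the response
    Passthrough, // None: keep whatever the inner namespace returned
}

fn delegated_path_action(flag: Option<bool>) -> DelegatedAction {
    match flag {
        Some(true) => DelegatedAction::Vend,
        Some(false) => DelegatedAction::Strip,
        None => DelegatedAction::Passthrough,
    }
}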
+ let deregistered_path = self.table_deregistered_file_path(&table_name); + self.put_marker_file_atomic( + &deregistered_path, + &format!("deregistration marker for table {}", table_name), + ) + .await + .map_err(|e| { + // Convert "already exists" to "already deregistered" for better UX + let message = if e.contains("already exists") { + format!("Table is already deregistered: {}", table_name) + } else { + e + }; + Error::Namespace { + source: message.into(), + location: snafu::location!(), + } + })?; + + Ok(lance_namespace::models::DeregisterTableResponse { + id: request.id, + location: Some(table_uri), + ..Default::default() }) } @@ -1188,28 +1657,6 @@ mod tests { ); } - #[tokio::test] - async fn test_create_table_with_wrong_location() { - let (namespace, _temp_dir) = create_test_namespace().await; - - // Create test IPC data - let schema = create_test_schema(); - let ipc_data = create_test_ipc_data(&schema); - - let mut request = CreateTableRequest::new(); - request.id = Some(vec!["test_table".to_string()]); - request.location = Some("/wrong/path/table.lance".to_string()); - - let result = namespace - .create_table(request, bytes::Bytes::from(ipc_data)) - .await; - assert!(result.is_err()); - assert!(result - .unwrap_err() - .to_string() - .contains("must be at location")); - } - #[tokio::test] async fn test_list_tables() { let (namespace, _temp_dir) = create_test_namespace().await; @@ -1751,6 +2198,7 @@ mod tests { } #[tokio::test] + #[allow(deprecated)] async fn test_create_empty_table() { let (namespace, temp_dir) = create_test_namespace().await; @@ -1795,6 +2243,7 @@ mod tests { } #[tokio::test] + #[allow(deprecated)] async fn test_create_empty_table_with_wrong_location() { let (namespace, _temp_dir) = create_test_namespace().await; @@ -1811,6 +2260,7 @@ mod tests { } #[tokio::test] + #[allow(deprecated)] async fn test_create_empty_table_then_drop() { let (namespace, temp_dir) = create_test_namespace().await; @@ -1859,8 +2309,7 @@ mod tests { // List child namespaces let list_req = ListNamespacesRequest { id: Some(vec![]), - page_token: None, - limit: None, + ..Default::default() }; let result = namespace.list_namespaces(list_req).await; assert!(result.is_ok()); @@ -1892,8 +2341,7 @@ mod tests { // List children of parent let list_req = ListNamespacesRequest { id: Some(vec!["parent".to_string()]), - page_token: None, - limit: None, + ..Default::default() }; let result = namespace.list_namespaces(list_req).await; assert!(result.is_ok()); @@ -1905,8 +2353,7 @@ mod tests { // List root should only show parent let list_req = ListNamespacesRequest { id: Some(vec![]), - page_token: None, - limit: None, + ..Default::default() }; let result = namespace.list_namespaces(list_req).await; assert!(result.is_ok()); @@ -1937,8 +2384,7 @@ mod tests { // List tables in child namespace let list_req = ListTablesRequest { id: Some(vec!["test_ns".to_string()]), - page_token: None, - limit: None, + ..Default::default() }; let result = namespace.list_tables(list_req).await; assert!(result.is_ok()); @@ -1985,8 +2431,7 @@ mod tests { // List tables let list_req = ListTablesRequest { id: Some(vec!["test_ns".to_string()]), - page_token: None, - limit: None, + ..Default::default() }; let result = namespace.list_tables(list_req).await; assert!(result.is_ok()); @@ -2030,6 +2475,7 @@ mod tests { } #[tokio::test] + #[allow(deprecated)] async fn test_empty_table_in_child_namespace() { let (namespace, _temp_dir) = create_test_namespace().await; @@ -2122,6 +2568,7 @@ mod tests { // Describe namespace and verify 
properties let describe_req = DescribeNamespaceRequest { id: Some(vec!["test_ns".to_string()]), + ..Default::default() }; let result = namespace.describe_namespace(describe_req).await; assert!(result.is_ok()); @@ -2200,6 +2647,7 @@ mod tests { id: Some(vec!["ns1".to_string()]), page_token: None, limit: None, + ..Default::default() }; let result = namespace.list_tables(list_req).await.unwrap(); assert_eq!(result.tables.len(), 1); @@ -2209,6 +2657,7 @@ mod tests { id: Some(vec!["ns2".to_string()]), page_token: None, limit: None, + ..Default::default() }; let result = namespace.list_tables(list_req).await.unwrap(); assert_eq!(result.tables.len(), 1); @@ -2360,7 +2809,7 @@ mod tests { register_req.id = Some(vec!["registered_table".to_string()]); let response = namespace.register_table(register_req).await.unwrap(); - assert_eq!(response.location, "external_table.lance"); + assert_eq!(response.location, Some("external_table.lance".to_string())); // Verify table exists in namespace let mut exists_req = TableExistsRequest::new(); @@ -2543,8 +2992,8 @@ mod tests { } #[tokio::test] - async fn test_register_deregister_without_manifest_fails() { - use lance_namespace::models::{DeregisterTableRequest, RegisterTableRequest}; + async fn test_register_without_manifest_fails() { + use lance_namespace::models::RegisterTableRequest; let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); @@ -2556,7 +3005,7 @@ mod tests { .await .unwrap(); - // Try to register - should fail + // Try to register - should fail (register requires manifest) let mut register_req = RegisterTableRequest::new("test_table.lance".to_string()); register_req.id = Some(vec!["test_table".to_string()]); let result = namespace.register_table(register_req).await; @@ -2566,15 +3015,8 @@ mod tests { .to_string() .contains("manifest mode is enabled")); - // Try to deregister - should fail - let mut deregister_req = DeregisterTableRequest::new(); - deregister_req.id = Some(vec!["test_table".to_string()]); - let result = namespace.deregister_table(deregister_req).await; - assert!(result.is_err()); - assert!(result - .unwrap_err() - .to_string() - .contains("manifest mode is enabled")); + // Note: deregister_table now works in V1 mode via .lance-deregistered marker files + // See test_deregister_table_v1_mode for that test case } #[tokio::test] @@ -2669,15 +3111,10 @@ mod tests { .unwrap(); let reader1 = RecordBatchIterator::new(vec![data1].into_iter().map(Ok), schema.clone()); - let dataset = Dataset::write_into_namespace( - reader1, - namespace.clone(), - table_id.clone(), - None, - false, - ) - .await - .unwrap(); + let dataset = + Dataset::write_into_namespace(reader1, namespace.clone(), table_id.clone(), None) + .await + .unwrap(); assert_eq!(dataset.count_rows(None).await.unwrap(), 3); assert_eq!(dataset.version().version, 1); @@ -2703,7 +3140,6 @@ mod tests { namespace.clone(), table_id.clone(), Some(params_append), - false, ) .await .unwrap(); @@ -2732,7 +3168,6 @@ mod tests { namespace.clone(), table_id.clone(), Some(params_overwrite), - false, ) .await .unwrap(); @@ -2750,4 +3185,372 @@ mod tests { .unwrap(); assert_eq!(a_col.values(), &[100, 200]); } + + // ============================================================ + // Tests for declare_table + // ============================================================ + + #[tokio::test] + async fn test_declare_table_v1_mode() { + use lance_namespace::models::{ + DeclareTableRequest, DescribeTableRequest, TableExistsRequest, + }; + + let temp_dir = TempStdDir::default(); 
+ let temp_path = temp_dir.to_str().unwrap(); + + // Create namespace in V1 mode (no manifest) + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .build() + .await + .unwrap(); + + // Declare a table + let mut declare_req = DeclareTableRequest::new(); + declare_req.id = Some(vec!["test_table".to_string()]); + let response = namespace.declare_table(declare_req).await.unwrap(); + + // Should return location + assert!(response.location.is_some()); + let location = response.location.as_ref().unwrap(); + assert!(location.ends_with("test_table.lance")); + + // Table should exist (via reserved file) + let mut exists_req = TableExistsRequest::new(); + exists_req.id = Some(vec!["test_table".to_string()]); + assert!(namespace.table_exists(exists_req).await.is_ok()); + + // Describe should work but return no version/schema (not written yet) + let mut describe_req = DescribeTableRequest::new(); + describe_req.id = Some(vec!["test_table".to_string()]); + let describe_response = namespace.describe_table(describe_req).await.unwrap(); + assert!(describe_response.location.is_some()); + assert!(describe_response.version.is_none()); // Not written yet + assert!(describe_response.schema.is_none()); // Not written yet + } + + #[tokio::test] + async fn test_declare_table_with_manifest() { + use lance_namespace::models::{DeclareTableRequest, TableExistsRequest}; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + // Create namespace with manifest + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(true) + .dir_listing_enabled(false) + .build() + .await + .unwrap(); + + // Declare a table + let mut declare_req = DeclareTableRequest::new(); + declare_req.id = Some(vec!["test_table".to_string()]); + let response = namespace.declare_table(declare_req).await.unwrap(); + + // Should return location + assert!(response.location.is_some()); + + // Table should exist in manifest + let mut exists_req = TableExistsRequest::new(); + exists_req.id = Some(vec!["test_table".to_string()]); + assert!(namespace.table_exists(exists_req).await.is_ok()); + } + + #[tokio::test] + async fn test_declare_table_when_table_exists() { + use lance_namespace::models::DeclareTableRequest; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .build() + .await + .unwrap(); + + // First create a table with actual data + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_req, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Try to declare the same table - should fail because it already has data + let mut declare_req = DeclareTableRequest::new(); + declare_req.id = Some(vec!["test_table".to_string()]); + let result = namespace.declare_table(declare_req).await; + assert!(result.is_err()); + } + + // ============================================================ + // Tests for deregister_table in V1 mode + // ============================================================ + + #[tokio::test] + async fn test_deregister_table_v1_mode() { + use lance_namespace::models::{DeregisterTableRequest, TableExistsRequest}; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + // Create namespace in V1 mode (no manifest, 
with dir listing) + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .dir_listing_enabled(true) + .build() + .await + .unwrap(); + + // Create a table with data + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_req, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Verify table exists + let mut exists_req = TableExistsRequest::new(); + exists_req.id = Some(vec!["test_table".to_string()]); + assert!(namespace.table_exists(exists_req.clone()).await.is_ok()); + + // Deregister the table + let mut deregister_req = DeregisterTableRequest::new(); + deregister_req.id = Some(vec!["test_table".to_string()]); + let response = namespace.deregister_table(deregister_req).await.unwrap(); + + // Should return location + assert!(response.location.is_some()); + let location = response.location.as_ref().unwrap(); + assert!(location.contains("test_table")); + + // Table should no longer exist (deregistered) + let result = namespace.table_exists(exists_req).await; + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("deregistered")); + + // Physical data should still exist + let dataset = Dataset::open(location).await; + assert!(dataset.is_ok(), "Physical table data should still exist"); + } + + #[tokio::test] + async fn test_deregister_table_v1_already_deregistered() { + use lance_namespace::models::DeregisterTableRequest; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .dir_listing_enabled(true) + .build() + .await + .unwrap(); + + // Create a table + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_req, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Deregister once + let mut deregister_req = DeregisterTableRequest::new(); + deregister_req.id = Some(vec!["test_table".to_string()]); + namespace + .deregister_table(deregister_req.clone()) + .await + .unwrap(); + + // Try to deregister again - should fail + let result = namespace.deregister_table(deregister_req).await; + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("already deregistered")); + } + + // ============================================================ + // Tests for list_tables skipping deregistered tables + // ============================================================ + + #[tokio::test] + async fn test_list_tables_skips_deregistered_v1() { + use lance_namespace::models::DeregisterTableRequest; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .dir_listing_enabled(true) + .build() + .await + .unwrap(); + + // Create two tables + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + + let mut create_req1 = CreateTableRequest::new(); + create_req1.id = Some(vec!["table1".to_string()]); + namespace + .create_table(create_req1, bytes::Bytes::from(ipc_data.clone())) + .await + .unwrap(); + + let mut create_req2 = CreateTableRequest::new(); + create_req2.id = Some(vec!["table2".to_string()]); + 
namespace + .create_table(create_req2, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // List tables - should see both (root namespace = empty vec) + let mut list_req = ListTablesRequest::new(); + list_req.id = Some(vec![]); + let list_response = namespace.list_tables(list_req.clone()).await.unwrap(); + assert_eq!(list_response.tables.len(), 2); + + // Deregister table1 + let mut deregister_req = DeregisterTableRequest::new(); + deregister_req.id = Some(vec!["table1".to_string()]); + namespace.deregister_table(deregister_req).await.unwrap(); + + // List tables - should only see table2 + let list_response = namespace.list_tables(list_req).await.unwrap(); + assert_eq!(list_response.tables.len(), 1); + assert!(list_response.tables.contains(&"table2".to_string())); + assert!(!list_response.tables.contains(&"table1".to_string())); + } + + // ============================================================ + // Tests for describe_table and table_exists with deregistered tables + // ============================================================ + + #[tokio::test] + async fn test_describe_table_fails_for_deregistered_v1() { + use lance_namespace::models::{DeregisterTableRequest, DescribeTableRequest}; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .dir_listing_enabled(true) + .build() + .await + .unwrap(); + + // Create a table + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_req, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Describe should work before deregistration + let mut describe_req = DescribeTableRequest::new(); + describe_req.id = Some(vec!["test_table".to_string()]); + assert!(namespace.describe_table(describe_req.clone()).await.is_ok()); + + // Deregister + let mut deregister_req = DeregisterTableRequest::new(); + deregister_req.id = Some(vec!["test_table".to_string()]); + namespace.deregister_table(deregister_req).await.unwrap(); + + // Describe should fail after deregistration + let result = namespace.describe_table(describe_req).await; + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("deregistered")); + } + + #[tokio::test] + async fn test_table_exists_fails_for_deregistered_v1() { + use lance_namespace::models::{DeregisterTableRequest, TableExistsRequest}; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .dir_listing_enabled(true) + .build() + .await + .unwrap(); + + // Create a table + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_req, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Table exists should work before deregistration + let mut exists_req = TableExistsRequest::new(); + exists_req.id = Some(vec!["test_table".to_string()]); + assert!(namespace.table_exists(exists_req.clone()).await.is_ok()); + + // Deregister + let mut deregister_req = DeregisterTableRequest::new(); + deregister_req.id = Some(vec!["test_table".to_string()]); + namespace.deregister_table(deregister_req).await.unwrap(); + + // Table exists should 
fail after deregistration + let result = namespace.table_exists(exists_req).await; + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("deregistered")); + } + + #[tokio::test] + async fn test_atomic_table_status_check() { + // This test verifies that the TableStatus check is atomic + // by ensuring a single directory listing is used + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .dir_listing_enabled(true) + .build() + .await + .unwrap(); + + // Create a table + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_req, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Table status should show exists=true, is_deregistered=false + let status = namespace.check_table_status("test_table").await; + assert!(status.exists); + assert!(!status.is_deregistered); + assert!(!status.has_reserved_file); + } } diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index ab6bb6fa78a..49d19712e26 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -24,12 +24,13 @@ use lance_index::IndexType; use lance_io::object_store::{ObjectStore, ObjectStoreParams}; use lance_namespace::models::{ CreateEmptyTableRequest, CreateEmptyTableResponse, CreateNamespaceRequest, - CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, DeregisterTableRequest, - DeregisterTableResponse, DescribeNamespaceRequest, DescribeNamespaceResponse, - DescribeTableRequest, DescribeTableResponse, DropNamespaceRequest, DropNamespaceResponse, - DropTableRequest, DropTableResponse, ListNamespacesRequest, ListNamespacesResponse, - ListTablesRequest, ListTablesResponse, NamespaceExistsRequest, RegisterTableRequest, - RegisterTableResponse, TableExistsRequest, + CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, DeclareTableRequest, + DeclareTableResponse, DeregisterTableRequest, DeregisterTableResponse, + DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableRequest, + DescribeTableResponse, DropNamespaceRequest, DropNamespaceResponse, DropTableRequest, + DropTableResponse, ListNamespacesRequest, ListNamespacesResponse, ListTablesRequest, + ListTablesResponse, NamespaceExistsRequest, RegisterTableRequest, RegisterTableResponse, + TableExistsRequest, }; use lance_namespace::schema::arrow_schema_to_json; use lance_namespace::LanceNamespace; @@ -981,7 +982,11 @@ impl ManifestNamespace { let write_params = WriteParams { session, store_params: storage_options.as_ref().map(|opts| ObjectStoreParams { - storage_options: Some(opts.clone()), + storage_options_accessor: Some(Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + opts.clone(), + ), + )), ..Default::default() }), ..Default::default() }; @@ -1078,11 +1083,41 @@ impl LanceNamespace for ManifestNamespace { let object_id = Self::str_object_id(table_id); let table_info = self.query_manifest_for_table(&object_id).await?; + // Extract table name and namespace from table_id + let table_name = table_id.last().cloned().unwrap_or_default(); + let namespace_id: Vec<String> = if table_id.len() > 1 { + table_id[..table_id.len() - 1].to_vec() + } else { + vec![] + }; + + let load_detailed_metadata =
request.load_detailed_metadata.unwrap_or(false); + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + match table_info { Some(info) => { // Construct full URI from relative location let table_uri = Self::construct_full_uri(&self.root, &info.location)?; + let storage_options = if vend_credentials { + self.storage_options.clone() + } else { + None + }; + + // If not loading detailed metadata, return minimal response with just location + if !load_detailed_metadata { + return Ok(DescribeTableResponse { + table: Some(table_name), + namespace: Some(namespace_id), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), + storage_options, + ..Default::default() + }); + } + // Try to open the dataset to get version and schema match Dataset::open(&table_uri).await { Ok(mut dataset) => { @@ -1097,21 +1132,25 @@ impl LanceNamespace for ManifestNamespace { let json_schema = arrow_schema_to_json(&arrow_schema)?; Ok(DescribeTableResponse { + table: Some(table_name.clone()), + namespace: Some(namespace_id.clone()), version: Some(version as i64), - location: Some(table_uri), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), schema: Some(Box::new(json_schema)), - properties: None, - storage_options: self.storage_options.clone(), + storage_options, + ..Default::default() }) } Err(_) => { // If dataset can't be opened (e.g., empty table), return minimal info Ok(DescribeTableResponse { - version: None, - location: Some(table_uri), - schema: None, - properties: None, - storage_options: self.storage_options.clone(), + table: Some(table_name), + namespace: Some(namespace_id), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), + storage_options, + ..Default::default() }) } } @@ -1197,21 +1236,6 @@ impl LanceNamespace for ManifestNamespace { }); } - // Validate location if provided - if let Some(location) = &request.location { - let location = location.trim_end_matches('/'); - if location != table_uri { - return Err(Error::Namespace { - source: format!( - "Cannot create table {} at location {}, must be at location {}", - table_name, location, table_uri - ) - .into(), - location: location!(), - }); - } - } - // Write the data using Lance Dataset let cursor = Cursor::new(data.to_vec()); let stream_reader = StreamReader::try_new(cursor, None) @@ -1252,8 +1276,8 @@ impl LanceNamespace for ManifestNamespace { Ok(CreateTableResponse { version: Some(1), location: Some(table_uri), - properties: None, storage_options: self.storage_options.clone(), + ..Default::default() }) } @@ -1297,8 +1321,7 @@ impl LanceNamespace for ManifestNamespace { Ok(DropTableResponse { id: request.id.clone(), location: Some(table_uri), - properties: None, - transaction_id: None, + ..Default::default() }) } None => Err(Error::Namespace { @@ -1370,8 +1393,10 @@ impl LanceNamespace for ManifestNamespace { // Root namespace always exists if namespace_id.is_empty() { + #[allow(clippy::needless_update)] return Ok(DescribeNamespaceResponse { properties: Some(HashMap::new()), + ..Default::default() }); } @@ -1380,8 +1405,10 @@ impl LanceNamespace for ManifestNamespace { let namespace_info = self.query_manifest_for_namespace(&object_id).await?; match namespace_info { + #[allow(clippy::needless_update)] Some(info) => Ok(DescribeNamespaceResponse { properties: info.metadata, + ..Default::default() }), None => Err(Error::Namespace { source: format!("Namespace '{}' not found", object_id).into(), @@ -1441,6 
+1468,7 @@ impl LanceNamespace for ManifestNamespace { Ok(CreateNamespaceResponse { properties: request.properties, + ..Default::default() }) } @@ -1502,10 +1530,7 @@ impl LanceNamespace for ManifestNamespace { self.delete_from_manifest(&object_id).await?; - Ok(DropNamespaceResponse { - properties: None, - transaction_id: None, - }) + Ok(DropNamespaceResponse::default()) } async fn namespace_exists(&self, request: NamespaceExistsRequest) -> Result<()> { @@ -1621,10 +1646,121 @@ impl LanceNamespace for ManifestNamespace { table_uri ); + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + let storage_options = if vend_credentials { + self.storage_options.clone() + } else { + None + }; + Ok(CreateEmptyTableResponse { location: Some(table_uri), - properties: None, - storage_options: self.storage_options.clone(), + storage_options, + ..Default::default() + }) + } + + async fn declare_table(&self, request: DeclareTableRequest) -> Result<DeclareTableResponse> { + let table_id = request.id.as_ref().ok_or_else(|| Error::InvalidInput { + source: "Table ID is required".into(), + location: location!(), + })?; + + if table_id.is_empty() { + return Err(Error::InvalidInput { + source: "Table ID cannot be empty".into(), + location: location!(), + }); + } + + let (namespace, table_name) = Self::split_object_id(table_id); + let object_id = Self::build_object_id(&namespace, &table_name); + + // Check if table already exists in manifest + let existing = self.query_manifest_for_table(&object_id).await?; + if existing.is_some() { + return Err(Error::Namespace { + source: format!("Table '{}' already exists", table_name).into(), + location: location!(), + }); + } + + // Create table location path with hash-based naming + // When dir_listing_enabled is true and it's a root table, use directory-style naming: {table_name}.lance + // Otherwise, use hash-based naming: {hash}_{object_id} + let dir_name = if namespace.is_empty() && self.dir_listing_enabled { + // Root table with directory listing enabled: use {table_name}.lance + format!("{}.lance", table_name) + } else { + // Child namespace table or dir listing disabled: use hash-based naming + Self::generate_dir_name(&object_id) + }; + let table_path = self.base_path.child(dir_name.as_str()); + let table_uri = Self::construct_full_uri(&self.root, &dir_name)?; + + // Validate location if provided + if let Some(req_location) = &request.location { + let req_location = req_location.trim_end_matches('/'); + if req_location != table_uri { + return Err(Error::Namespace { + source: format!( + "Cannot declare table {} at location {}, must be at location {}", + table_name, req_location, table_uri + ) + .into(), + location: location!(), + }); + } + } + + // Create the .lance-reserved file to mark the table as existing + let reserved_file_path = table_path.child(".lance-reserved"); + + self.object_store + .create(&reserved_file_path) + .await + .map_err(|e| Error::Namespace { + source: format!( + "Failed to create .lance-reserved file for table {}: {}", + table_name, e + ) + .into(), + location: location!(), + })?
+ .shutdown() + .await + .map_err(|e| Error::Namespace { + source: format!( + "Failed to finalize .lance-reserved file for table {}: {}", + table_name, e + ) + .into(), + location: location!(), + })?; + + // Add entry to manifest marking this as a declared table (store dir_name, not full path) + self.insert_into_manifest(object_id, ObjectType::Table, Some(dir_name)) + .await?; + + log::info!( + "Declared table '{}' in manifest at {}", + table_name, + table_uri + ); + + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + let storage_options = if vend_credentials { + self.storage_options.clone() + } else { + None + }; + + Ok(DeclareTableResponse { + location: Some(table_uri), + storage_options, + ..Default::default() }) } @@ -1697,8 +1833,8 @@ impl LanceNamespace for ManifestNamespace { .await?; Ok(RegisterTableResponse { - location, - properties: None, + location: Some(location), + ..Default::default() }) } @@ -1741,7 +1877,7 @@ impl LanceNamespace for ManifestNamespace { Ok(DeregisterTableResponse { id: request.id.clone(), location: Some(table_uri), - properties: None, + ..Default::default() }) } } @@ -2169,6 +2305,7 @@ mod tests { // Verify namespace exists let exists_req = NamespaceExistsRequest { id: Some(vec!["ns1".to_string()]), + ..Default::default() }; let result = dir_namespace.namespace_exists(exists_req).await; assert!(result.is_ok(), "Namespace should exist"); @@ -2178,6 +2315,7 @@ mod tests { id: Some(vec![]), page_token: None, limit: None, + ..Default::default() }; let result = dir_namespace.list_namespaces(list_req).await; assert!(result.is_ok()); @@ -2222,6 +2360,7 @@ mod tests { // Verify nested namespace exists let exists_req = NamespaceExistsRequest { id: Some(vec!["parent".to_string(), "child".to_string()]), + ..Default::default() }; let result = dir_namespace.namespace_exists(exists_req).await; assert!(result.is_ok(), "Nested namespace should exist"); @@ -2231,6 +2370,7 @@ mod tests { id: Some(vec!["parent".to_string()]), page_token: None, limit: None, + ..Default::default() }; let result = dir_namespace.list_namespaces(list_req).await; assert!(result.is_ok()); @@ -2298,6 +2438,7 @@ mod tests { // Verify namespace no longer exists let exists_req = NamespaceExistsRequest { id: Some(vec!["ns1".to_string()]), + ..Default::default() }; let result = dir_namespace.namespace_exists(exists_req).await; assert!(result.is_err(), "Namespace should not exist after drop"); @@ -2376,6 +2517,7 @@ mod tests { id: Some(vec!["ns1".to_string()]), page_token: None, limit: None, + ..Default::default() }; let result = dir_namespace.list_tables(list_req).await; assert!(result.is_ok()); @@ -2412,6 +2554,7 @@ mod tests { // Describe the namespace let describe_req = DescribeNamespaceRequest { id: Some(vec!["ns1".to_string()]), + ..Default::default() }; let result = dir_namespace.describe_namespace(describe_req).await; assert!( diff --git a/rust/lance-namespace-impls/src/lib.rs b/rust/lance-namespace-impls/src/lib.rs index 634199ce98a..83fb93ddc0e 100644 --- a/rust/lance-namespace-impls/src/lib.rs +++ b/rust/lance-namespace-impls/src/lib.rs @@ -10,12 +10,49 @@ //! - `rest`: REST API-based namespace implementation //! - `rest-adapter`: REST server adapter that exposes any namespace via HTTP //! - `dir-aws`, `dir-azure`, `dir-gcp`, `dir-oss`: Cloud storage backend support for directory namespace (via lance-io) +//! 
- `credential-vendor-aws`, `credential-vendor-gcp`, `credential-vendor-azure`: Credential vending for cloud storage //! //! ## Implementations //! //! - `DirectoryNamespace`: Directory-based implementation (always available) //! - `RestNamespace`: REST API-based implementation (requires `rest` feature) //! +//! ## Credential Vending +//! +//! The `credentials` module provides temporary credential vending for cloud storage: +//! - AWS: STS AssumeRole with scoped IAM policies (requires `credential-vendor-aws` feature) +//! - GCP: OAuth2 tokens with access boundaries (requires `credential-vendor-gcp` feature) +//! - Azure: SAS tokens with user delegation keys (requires `credential-vendor-azure` feature) +//! +//! The credential vendor is automatically selected based on the table location URI scheme: +//! - `s3://` for AWS +//! - `gs://` for GCP +//! - `az://` for Azure +//! +//! Configuration properties (prefixed with `credential_vendor.`, prefix is stripped): +//! +//! ```text +//! # Required to enable credential vending +//! credential_vendor.enabled = "true" +//! +//! # Common properties (apply to all providers) +//! credential_vendor.permission = "read" # read, write, or admin (default: read) +//! +//! # AWS-specific properties (for s3:// locations) +//! credential_vendor.aws_role_arn = "arn:aws:iam::123456789012:role/MyRole" # required for AWS +//! credential_vendor.aws_duration_millis = "3600000" # 1 hour (default, range: 15min-12hrs) +//! +//! # GCP-specific properties (for gs:// locations) +//! # Note: GCP uses ADC; set GOOGLE_APPLICATION_CREDENTIALS env var for service account key +//! # Note: GCP token duration cannot be configured; it's determined by the STS endpoint +//! credential_vendor.gcp_service_account = "my-sa@project.iam.gserviceaccount.com" +//! +//! # Azure-specific properties (for az:// locations) +//! credential_vendor.azure_account_name = "mystorageaccount" # required for Azure +//! credential_vendor.azure_tenant_id = "my-tenant-id" +//! credential_vendor.azure_duration_millis = "3600000" # 1 hour (default, up to 7 days) +//! ``` +//! //! ## Usage //! //! The recommended way to connect to a namespace is using [`ConnectBuilder`]: @@ -32,6 +69,8 @@ //! 
``` pub mod connect; +pub mod context; +pub mod credentials; pub mod dir; #[cfg(feature = "rest")] @@ -42,8 +81,30 @@ pub mod rest_adapter; // Re-export connect builder pub use connect::ConnectBuilder; +pub use context::{DynamicContextProvider, OperationInfo}; pub use dir::{manifest::ManifestNamespace, DirectoryNamespace, DirectoryNamespaceBuilder}; +// Re-export credential vending +pub use credentials::{ + create_credential_vendor_for_location, detect_provider_from_uri, has_credential_vendor_config, + redact_credential, CredentialVendor, VendedCredentials, DEFAULT_CREDENTIAL_DURATION_MILLIS, +}; + +#[cfg(feature = "credential-vendor-aws")] +pub use credentials::aws::{AwsCredentialVendor, AwsCredentialVendorConfig}; +#[cfg(feature = "credential-vendor-aws")] +pub use credentials::aws_props; + +#[cfg(feature = "credential-vendor-gcp")] +pub use credentials::gcp::{GcpCredentialVendor, GcpCredentialVendorConfig}; +#[cfg(feature = "credential-vendor-gcp")] +pub use credentials::gcp_props; + +#[cfg(feature = "credential-vendor-azure")] +pub use credentials::azure::{AzureCredentialVendor, AzureCredentialVendorConfig}; +#[cfg(feature = "credential-vendor-azure")] +pub use credentials::azure_props; + #[cfg(feature = "rest")] pub use rest::{RestNamespace, RestNamespaceBuilder}; diff --git a/rust/lance-namespace-impls/src/rest.rs b/rust/lance-namespace-impls/src/rest.rs index 1f7ee341d26..0eae07e4ce2 100644 --- a/rust/lance-namespace-impls/src/rest.rs +++ b/rust/lance-namespace-impls/src/rest.rs @@ -4,33 +4,137 @@ //! REST implementation of Lance Namespace use std::collections::HashMap; +use std::str::FromStr; +use std::sync::Arc; use async_trait::async_trait; use bytes::Bytes; +use reqwest::header::{HeaderName, HeaderValue}; -use lance_namespace::apis::{ - configuration::Configuration, namespace_api, table_api, transaction_api, -}; +use crate::context::{DynamicContextProvider, OperationInfo}; + +use lance_namespace::apis::urlencode; use lance_namespace::models::{ - AlterTransactionRequest, AlterTransactionResponse, CountTableRowsRequest, - CreateEmptyTableRequest, CreateEmptyTableResponse, CreateNamespaceRequest, - CreateNamespaceResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, - CreateTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, DeregisterTableRequest, - DeregisterTableResponse, DescribeNamespaceRequest, DescribeNamespaceResponse, - DescribeTableIndexStatsRequest, DescribeTableIndexStatsResponse, DescribeTableRequest, - DescribeTableResponse, DescribeTransactionRequest, DescribeTransactionResponse, - DropNamespaceRequest, DropNamespaceResponse, DropTableRequest, DropTableResponse, - InsertIntoTableRequest, InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, - ListTableIndicesRequest, ListTableIndicesResponse, ListTablesRequest, ListTablesResponse, + AlterTableAddColumnsRequest, AlterTableAddColumnsResponse, AlterTableAlterColumnsRequest, + AlterTableAlterColumnsResponse, AlterTableDropColumnsRequest, AlterTableDropColumnsResponse, + AlterTransactionRequest, AlterTransactionResponse, AnalyzeTableQueryPlanRequest, + CountTableRowsRequest, CreateEmptyTableRequest, CreateEmptyTableResponse, + CreateNamespaceRequest, CreateNamespaceResponse, CreateTableIndexRequest, + CreateTableIndexResponse, CreateTableRequest, CreateTableResponse, + CreateTableScalarIndexResponse, CreateTableTagRequest, CreateTableTagResponse, + DeclareTableRequest, DeclareTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, + DeleteTableTagRequest, 
DeleteTableTagResponse, DeregisterTableRequest, DeregisterTableResponse, + DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableIndexStatsRequest, + DescribeTableIndexStatsResponse, DescribeTableRequest, DescribeTableResponse, + DescribeTransactionRequest, DescribeTransactionResponse, DropNamespaceRequest, + DropNamespaceResponse, DropTableIndexRequest, DropTableIndexResponse, DropTableRequest, + DropTableResponse, ExplainTableQueryPlanRequest, GetTableStatsRequest, GetTableStatsResponse, + GetTableTagVersionRequest, GetTableTagVersionResponse, InsertIntoTableRequest, + InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, + ListTableIndicesRequest, ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, + ListTableVersionsRequest, ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, MergeInsertIntoTableRequest, MergeInsertIntoTableResponse, NamespaceExistsRequest, - QueryTableRequest, RegisterTableRequest, RegisterTableResponse, TableExistsRequest, - UpdateTableRequest, UpdateTableResponse, + QueryTableRequest, RegisterTableRequest, RegisterTableResponse, RenameTableRequest, + RenameTableResponse, RestoreTableRequest, RestoreTableResponse, TableExistsRequest, + UpdateTableRequest, UpdateTableResponse, UpdateTableSchemaMetadataRequest, + UpdateTableSchemaMetadataResponse, UpdateTableTagRequest, UpdateTableTagResponse, }; +use serde::{de::DeserializeOwned, Serialize}; use lance_core::{box_error, Error, Result}; use lance_namespace::LanceNamespace; +/// HTTP client wrapper that supports per-request header injection. +/// +/// This client wraps a single `reqwest::Client` and applies dynamic headers +/// to each request without recreating the client. This is more efficient than +/// creating a new client per request when using a `DynamicContextProvider`. +/// +/// The design follows lancedb's `RestfulLanceDbClient` pattern where headers +/// are applied to the built request using `headers_mut()` before execution. +#[derive(Clone)] +struct RestClient { + client: reqwest::Client, + base_path: String, + base_headers: HashMap<String, String>, + context_provider: Option<Arc<dyn DynamicContextProvider>>, +} + +impl std::fmt::Debug for RestClient { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RestClient") + .field("base_path", &self.base_path) + .field("base_headers", &self.base_headers) + .field( + "context_provider", + &self.context_provider.as_ref().map(|_| "Some(...)"), + ) + .finish() + } +} + +impl RestClient { + /// Apply base headers and dynamic context headers to a request. + /// + /// This method mutates the request's headers directly, which is more efficient + /// than creating a new client with default_headers for each request.
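The build-then-mutate pattern `RestClient` describes here, constructing the `reqwest::Request` first and editing `headers_mut()` before `Client::execute`, can be exercised in isolation. A minimal sketch, assuming a tokio runtime; the URL and header values are placeholders:

use reqwest::header::{HeaderName, HeaderValue};
use std::str::FromStr;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let client = reqwest::Client::new();
    // Build the request without sending it...
    let mut request = client
        .get("http://localhost:8080/v1/namespace//list")
        .build()?;
    // ...then inject headers directly. `HeaderMap::insert` replaces any existing
    // value, which is how context headers win over base headers on conflict.
    let headers = request.headers_mut();
    if let (Ok(name), Ok(value)) = (
        HeaderName::from_str("x-lance-operation"),
        HeaderValue::from_str("list_namespaces"),
    ) {
        headers.insert(name, value);
    }
    let response = client.execute(request).await?;
    println!("status = {}", response.status());
    Ok(())
}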
+ fn apply_headers(&self, request: &mut reqwest::Request, operation: &str, object_id: &str) { + let request_headers = request.headers_mut(); + + // First apply base headers + for (key, value) in &self.base_headers { + if let (Ok(header_name), Ok(header_value)) = + (HeaderName::from_str(key), HeaderValue::from_str(value)) + { + request_headers.insert(header_name, header_value); + } + } + + // Then apply context headers (override base headers if conflict) + if let Some(provider) = &self.context_provider { + let info = OperationInfo::new(operation, object_id); + let context = provider.provide_context(&info); + + const HEADERS_PREFIX: &str = "headers."; + for (key, value) in context { + if let Some(header_name) = key.strip_prefix(HEADERS_PREFIX) { + if let (Ok(header_name), Ok(header_value)) = ( + HeaderName::from_str(header_name), + HeaderValue::from_str(&value), + ) { + request_headers.insert(header_name, header_value); + } + } + } + } + } + + /// Execute a request with dynamic headers applied. + /// + /// This method builds the request, applies headers, and executes it. + async fn execute( + &self, + req_builder: reqwest::RequestBuilder, + operation: &str, + object_id: &str, + ) -> std::result::Result<reqwest::Response, reqwest::Error> { + let mut request = req_builder.build()?; + self.apply_headers(&mut request, operation, object_id); + self.client.execute(request).await + } + + /// Get the base path URL + fn base_path(&self) -> &str { + &self.base_path + } + + /// Get a reference to the underlying reqwest client + fn client(&self) -> &reqwest::Client { + &self.client + } +} /// Builder for creating a RestNamespace. /// /// This builder provides a fluent API for configuring and establishing @@ -49,7 +153,7 @@ use lance_namespace::LanceNamespace; /// # Ok(()) /// # } /// ``` -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct RestNamespaceBuilder { uri: String, delimiter: String, @@ -58,6 +162,25 @@ pub struct RestNamespaceBuilder { key_file: Option<String>, ssl_ca_cert: Option<String>, assert_hostname: bool, + context_provider: Option<Arc<dyn DynamicContextProvider>>, +} + +impl std::fmt::Debug for RestNamespaceBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RestNamespaceBuilder") + .field("uri", &self.uri) + .field("delimiter", &self.delimiter) + .field("headers", &self.headers) + .field("cert_file", &self.cert_file) + .field("key_file", &self.key_file) + .field("ssl_ca_cert", &self.ssl_ca_cert) + .field("assert_hostname", &self.assert_hostname) + .field( + "context_provider", + &self.context_provider.as_ref().map(|_| "Some(...)"), + ) + .finish() + } } impl RestNamespaceBuilder { @@ -78,6 +201,7 @@ impl RestNamespaceBuilder { key_file: None, ssl_ca_cert: None, assert_hostname: true, + context_provider: None, } } @@ -162,6 +286,7 @@ impl RestNamespaceBuilder { key_file, ssl_ca_cert, assert_hostname, + context_provider: None, }) } @@ -236,6 +361,44 @@ impl RestNamespaceBuilder { self } + /// Set a dynamic context provider for per-request context. + /// + /// The provider will be called before each HTTP request to generate + /// additional context. Context keys that start with `headers.` are converted + /// to HTTP headers by stripping the prefix. For example, `headers.Authorization` + /// becomes the `Authorization` header. Keys without the `headers.` prefix are ignored.
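+ /// + /// For example, a context entry of `("headers.x-request-id", "abc123")` is sent as + /// the HTTP header `x-request-id: abc123`, while an entry keyed `"trace-id"` (no + /// prefix) is silently dropped.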
+ /// + /// # Arguments + /// + /// * `provider` - The context provider implementation + /// + /// # Examples + /// + /// ```ignore + /// use lance_namespace_impls::{RestNamespaceBuilder, DynamicContextProvider, OperationInfo}; + /// use std::collections::HashMap; + /// use std::sync::Arc; + /// + /// #[derive(Debug)] + /// struct MyProvider; + /// + /// impl DynamicContextProvider for MyProvider { + /// fn provide_context(&self, info: &OperationInfo) -> HashMap<String, String> { + /// let mut ctx = HashMap::new(); + /// // Keys must carry the `headers.` prefix to be emitted as HTTP headers. + /// ctx.insert("headers.auth-token".to_string(), "my-token".to_string()); + /// ctx + /// } + /// } + /// + /// let namespace = RestNamespaceBuilder::new("http://localhost:8080") + /// .context_provider(Arc::new(MyProvider)) + /// .build(); + /// ``` + pub fn context_provider(mut self, provider: Arc<dyn DynamicContextProvider>) -> Self { + self.context_provider = Some(provider); + self + } + /// Build the RestNamespace. /// /// # Returns @@ -258,29 +421,6 @@ fn object_id_str(id: &Option<Vec<String>>, delimiter: &str) -> Result<String> { } } -/// Convert API error to lance core error -fn convert_api_error<T: std::fmt::Debug>(err: lance_namespace::apis::Error<T>) -> Error { - use lance_namespace::apis::Error as ApiError; - match err { - ApiError::Reqwest(e) => Error::IO { - source: box_error(e), - location: snafu::location!(), - }, - ApiError::Serde(e) => Error::Namespace { - source: format!("Serialization error: {}", e).into(), - location: snafu::location!(), - }, - ApiError::Io(e) => Error::IO { - source: box_error(e), - location: snafu::location!(), - }, - ApiError::ResponseError(e) => Error::Namespace { - source: format!("Response error: {:?}", e).into(), - location: snafu::location!(), - }, - } -} - /// REST implementation of Lance Namespace /// /// # Examples /// @@ -297,7 +437,8 @@ fn convert_api_error<T: std::fmt::Debug>(err: lance_namespace::apis::Error<T>) - #[derive(Clone)] pub struct RestNamespace { delimiter: String, - reqwest_config: Configuration, + /// REST client that handles per-request header injection efficiently.
+ rest_client: RestClient, } impl std::fmt::Debug for RestNamespace { @@ -315,23 +456,9 @@ impl std::fmt::Display for RestNamespace { impl RestNamespace { /// Create a new REST namespace from builder pub(crate) fn from_builder(builder: RestNamespaceBuilder) -> Self { - // Build reqwest client with custom headers if provided + // Build reqwest client WITHOUT default headers - we'll apply headers per-request let mut client_builder = reqwest::Client::builder(); - // Add custom headers to the client - if !builder.headers.is_empty() { - let mut headers = reqwest::header::HeaderMap::new(); - for (key, value) in &builder.headers { - if let (Ok(header_name), Ok(header_value)) = ( - reqwest::header::HeaderName::from_bytes(key.as_bytes()), - reqwest::header::HeaderValue::from_str(value), - ) { - headers.insert(header_name, header_value); - } - } - client_builder = client_builder.default_headers(headers); - } - // Configure mTLS if certificate and key files are provided if let (Some(cert_file), Some(key_file)) = (&builder.cert_file, &builder.key_file) { if let (Ok(cert), Ok(key)) = (std::fs::read(cert_file), std::fs::read(key_file)) { @@ -357,28 +484,218 @@ impl RestNamespace { .build() .unwrap_or_else(|_| reqwest::Client::new()); - let mut reqwest_config = Configuration::new(); - reqwest_config.client = client; - reqwest_config.base_path = builder.uri; + // Create the RestClient that handles per-request header injection + let rest_client = RestClient { + client, + base_path: builder.uri, + base_headers: builder.headers, + context_provider: builder.context_provider, + }; Self { delimiter: builder.delimiter, - reqwest_config, + rest_client, } } - /// Create a new REST namespace with custom configuration (for testing) - #[cfg(test)] - pub fn with_configuration(delimiter: String, reqwest_config: Configuration) -> Self { - Self { - delimiter, - reqwest_config, + /// Execute a GET request and parse JSON response. + async fn get_json<T: DeserializeOwned>( + &self, + path: &str, + query: &[(&str, &str)], + operation: &str, + object_id: &str, + ) -> Result<T> { + let url = format!("{}{}", self.rest_client.base_path(), path); + let req_builder = self.rest_client.client().get(&url).query(query); + + let resp = self + .rest_client + .execute(req_builder, operation, object_id) + .await + .map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + + let status = resp.status(); + let content = resp.text().await.map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + + if status.is_success() { + serde_json::from_str(&content).map_err(|e| Error::Namespace { + source: format!("Failed to parse response: {}", e).into(), + location: snafu::location!(), + }) + } else { + Err(Error::Namespace { + source: format!("Response error: status={}, content={}", status, content).into(), + location: snafu::location!(), + }) + } + } + + /// Execute a POST request with JSON body and parse JSON response.
+ async fn post_json<B: serde::Serialize, T: serde::de::DeserializeOwned>( + &self, + path: &str, + query: &[(&str, &str)], + body: &B, + operation: &str, + object_id: &str, + ) -> Result<T> { + let url = format!("{}{}", self.rest_client.base_path(), path); + let req_builder = self.rest_client.client().post(&url).query(query).json(body); + + let resp = self + .rest_client + .execute(req_builder, operation, object_id) + .await + .map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + + let status = resp.status(); + let content = resp.text().await.map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + + if status.is_success() { + serde_json::from_str(&content).map_err(|e| Error::Namespace { + source: format!("Failed to parse response: {}", e).into(), + location: snafu::location!(), + }) + } else { + Err(Error::Namespace { + source: format!("Response error: status={}, content={}", status, content).into(), + location: snafu::location!(), + }) + } + } + + /// Execute a POST request that returns nothing (204 No Content expected). + async fn post_json_no_content<B: serde::Serialize>( + &self, + path: &str, + query: &[(&str, &str)], + body: &B, + operation: &str, + object_id: &str, + ) -> Result<()> { + let url = format!("{}{}", self.rest_client.base_path(), path); + let req_builder = self.rest_client.client().post(&url).query(query).json(body); + + let resp = self + .rest_client + .execute(req_builder, operation, object_id) + .await + .map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + + let status = resp.status(); + if status.is_success() { + Ok(()) + } else { + let content = resp.text().await.map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + Err(Error::Namespace { + source: format!("Response error: status={}, content={}", status, content).into(), + location: snafu::location!(), + }) + } + } + + /// Execute a POST request with binary body and parse JSON response. + async fn post_binary_json<T: serde::de::DeserializeOwned>( + &self, + path: &str, + query: &[(&str, &str)], + body: Vec<u8>, + operation: &str, + object_id: &str, + ) -> Result<T> { + let url = format!("{}{}", self.rest_client.base_path(), path); + let req_builder = self.rest_client.client().post(&url).query(query).body(body); + + let resp = self + .rest_client + .execute(req_builder, operation, object_id) + .await + .map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + + let status = resp.status(); + let content = resp.text().await.map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + + if status.is_success() { + serde_json::from_str(&content).map_err(|e| Error::Namespace { + source: format!("Failed to parse response: {}", e).into(), + location: snafu::location!(), + }) + } else { + Err(Error::Namespace { + source: format!("Response error: status={}, content={}", status, content).into(), + location: snafu::location!(), + }) + } + } + + /// Execute a POST request with JSON body and get binary response.
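+ /// + /// Currently unused (hence the `dead_code` allowance below); kept as the counterpart + /// of `post_binary_json` for endpoints that respond with raw bytes instead of JSON.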
+ #[allow(dead_code)] + async fn post_json_binary<B: serde::Serialize>( + &self, + path: &str, + query: &[(&str, &str)], + body: &B, + operation: &str, + object_id: &str, + ) -> Result<Bytes> { + let url = format!("{}{}", self.rest_client.base_path(), path); + let req_builder = self.rest_client.client().post(&url).query(query).json(body); + + let resp = self + .rest_client + .execute(req_builder, operation, object_id) + .await + .map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + + let status = resp.status(); + if status.is_success() { + resp.bytes().await.map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + }) + } else { + let content = resp.text().await.map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + Err(Error::Namespace { + source: format!("Response error: status={}, content={}", status, content).into(), + location: snafu::location!(), + }) } } /// Get the base endpoint URL for this namespace pub fn endpoint(&self) -> &str { - &self.reqwest_config.base_path + self.rest_client.base_path() } } @@ -389,16 +706,20 @@ impl LanceNamespace for RestNamespace { request: ListNamespacesRequest, ) -> Result<ListNamespacesResponse> { let id = object_id_str(&request.id, &self.delimiter)?; - - namespace_api::list_namespaces( - &self.reqwest_config, - &id, - Some(&self.delimiter), - request.page_token.as_deref(), - request.limit, - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/namespace/{}/list", encoded_id); + let mut query = vec![("delimiter", self.delimiter.as_str())]; + let page_token_str; + if let Some(ref pt) = request.page_token { + page_token_str = pt.clone(); + query.push(("page_token", page_token_str.as_str())); + } + let limit_str; + if let Some(limit) = request.limit { + limit_str = limit.to_string(); + query.push(("limit", limit_str.as_str())); + } + self.get_json(&path, &query, "list_namespaces", &id).await } async fn describe_namespace( @@ -406,10 +727,11 @@ impl LanceNamespace for RestNamespace { request: DescribeNamespaceRequest, ) -> Result<DescribeNamespaceResponse> { let id = object_id_str(&request.id, &self.delimiter)?; - - namespace_api::describe_namespace(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/namespace/{}/describe", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "describe_namespace", &id) .await - .map_err(convert_api_error) } async fn create_namespace( @@ -417,72 +739,93 @@ impl LanceNamespace for RestNamespace { request: CreateNamespaceRequest, ) -> Result<CreateNamespaceResponse> { let id = object_id_str(&request.id, &self.delimiter)?; - - namespace_api::create_namespace(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/namespace/{}/create", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "create_namespace", &id) .await - .map_err(convert_api_error) } async fn drop_namespace(&self, request: DropNamespaceRequest) -> Result<DropNamespaceResponse> { let id = object_id_str(&request.id, &self.delimiter)?; - - namespace_api::drop_namespace(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/namespace/{}/drop", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "drop_namespace", &id) .await - .map_err(convert_api_error) } async fn
namespace_exists(&self, request: NamespaceExistsRequest) -> Result<()> { let id = object_id_str(&request.id, &self.delimiter)?; - - namespace_api::namespace_exists(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/namespace/{}/exists", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json_no_content(&path, &query, &request, "namespace_exists", &id) .await - .map_err(convert_api_error) } async fn list_tables(&self, request: ListTablesRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::list_tables( - &self.reqwest_config, - &id, - Some(&self.delimiter), - request.page_token.as_deref(), - request.limit, - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/namespace/{}/table/list", encoded_id); + let mut query = vec![("delimiter", self.delimiter.as_str())]; + let page_token_str; + if let Some(ref pt) = request.page_token { + page_token_str = pt.clone(); + query.push(("page_token", page_token_str.as_str())); + } + let limit_str; + if let Some(limit) = request.limit { + limit_str = limit.to_string(); + query.push(("limit", limit_str.as_str())); + } + self.get_json(&path, &query, "list_tables", &id).await } async fn describe_table(&self, request: DescribeTableRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::describe_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/describe", encoded_id); + let mut query = vec![("delimiter", self.delimiter.as_str())]; + let with_uri_str; + if let Some(with_uri) = request.with_table_uri { + with_uri_str = with_uri.to_string(); + query.push(("with_table_uri", with_uri_str.as_str())); + } + let detailed_str; + if let Some(detailed) = request.load_detailed_metadata { + detailed_str = detailed.to_string(); + query.push(("load_detailed_metadata", detailed_str.as_str())); + } + self.post_json(&path, &query, &request, "describe_table", &id) .await - .map_err(convert_api_error) } async fn register_table(&self, request: RegisterTableRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::register_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/register", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "register_table", &id) .await - .map_err(convert_api_error) } async fn table_exists(&self, request: TableExistsRequest) -> Result<()> { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::table_exists(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/exists", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json_no_content(&path, &query, &request, "table_exists", &id) .await - .map_err(convert_api_error) } async fn drop_table(&self, request: DropTableRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::drop_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/drop", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "drop_table", &id) .await - .map_err(convert_api_error) } async 
fn deregister_table( @@ -490,18 +833,19 @@ impl LanceNamespace for RestNamespace { request: DeregisterTableRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::deregister_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/deregister", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "deregister_table", &id) .await - .map_err(convert_api_error) } async fn count_table_rows(&self, request: CountTableRowsRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::count_table_rows(&self.reqwest_config, &id, request, Some(&self.delimiter)) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/count_rows", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.get_json(&path, &query, "count_table_rows", &id).await } async fn create_table( @@ -510,30 +854,16 @@ impl LanceNamespace for RestNamespace { request_data: Bytes, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - let properties_json = request - .properties - .as_ref() - .map(|props| serde_json::to_string(props).unwrap_or_else(|_| "{}".to_string())); - - use lance_namespace::models::create_table_request::Mode; - let mode = request.mode.as_ref().map(|m| match m { - Mode::Create => "create", - Mode::ExistOk => "exist_ok", - Mode::Overwrite => "overwrite", - }); - - table_api::create_table( - &self.reqwest_config, - &id, - request_data.to_vec(), - Some(&self.delimiter), - mode, - request.location.as_deref(), - properties_json.as_deref(), - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/create", encoded_id); + let mut query = vec![("delimiter", self.delimiter.as_str())]; + let mode_str; + if let Some(ref mode) = request.mode { + mode_str = mode.clone(); + query.push(("mode", mode_str.as_str())); + } + self.post_binary_json(&path, &query, request_data.to_vec(), "create_table", &id) + .await } async fn create_empty_table( @@ -541,10 +871,20 @@ impl LanceNamespace for RestNamespace { request: CreateEmptyTableRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/create-empty", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "create_empty_table", &id) + .await + } - table_api::create_empty_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + async fn declare_table(&self, request: DeclareTableRequest) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/declare", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "declare_table", &id) .await - .map_err(convert_api_error) } async fn insert_into_table( @@ -553,22 +893,22 @@ impl LanceNamespace for RestNamespace { request_data: Bytes, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - use lance_namespace::models::insert_into_table_request::Mode; - let mode = request.mode.as_ref().map(|m| match m { - Mode::Append => "append", - Mode::Overwrite => "overwrite", - }); - - table_api::insert_into_table( - &self.reqwest_config, - &id, + let encoded_id = urlencode(&id); + let path = 
format!("/v1/table/{}/insert", encoded_id); + let mut query = vec![("delimiter", self.delimiter.as_str())]; + let mode_str; + if let Some(ref mode) = request.mode { + mode_str = mode.clone(); + query.push(("mode", mode_str.as_str())); + } + self.post_binary_json( + &path, + &query, request_data.to_vec(), - Some(&self.delimiter), - mode, + "insert_into_table", + &id, ) .await - .map_err(convert_api_error) } async fn merge_insert_into_table( @@ -577,34 +917,72 @@ impl LanceNamespace for RestNamespace { request_data: Bytes, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); let on = request.on.as_deref().ok_or_else(|| Error::Namespace { source: "'on' field is required for merge insert".into(), location: snafu::location!(), })?; - table_api::merge_insert_into_table( - &self.reqwest_config, - &id, - on, + let path = format!("/v1/table/{}/merge_insert", encoded_id); + let mut query = vec![("delimiter", self.delimiter.as_str()), ("on", on)]; + + let when_matched_update_all_str; + if let Some(v) = request.when_matched_update_all { + when_matched_update_all_str = v.to_string(); + query.push(( + "when_matched_update_all", + when_matched_update_all_str.as_str(), + )); + } + if let Some(ref v) = request.when_matched_update_all_filt { + query.push(("when_matched_update_all_filt", v.as_str())); + } + let when_not_matched_insert_all_str; + if let Some(v) = request.when_not_matched_insert_all { + when_not_matched_insert_all_str = v.to_string(); + query.push(( + "when_not_matched_insert_all", + when_not_matched_insert_all_str.as_str(), + )); + } + let when_not_matched_by_source_delete_str; + if let Some(v) = request.when_not_matched_by_source_delete { + when_not_matched_by_source_delete_str = v.to_string(); + query.push(( + "when_not_matched_by_source_delete", + when_not_matched_by_source_delete_str.as_str(), + )); + } + if let Some(ref v) = request.when_not_matched_by_source_delete_filt { + query.push(("when_not_matched_by_source_delete_filt", v.as_str())); + } + if let Some(ref v) = request.timeout { + query.push(("timeout", v.as_str())); + } + let use_index_str; + if let Some(v) = request.use_index { + use_index_str = v.to_string(); + query.push(("use_index", use_index_str.as_str())); + } + + self.post_binary_json( + &path, + &query, request_data.to_vec(), - Some(&self.delimiter), - request.when_matched_update_all, - request.when_matched_update_all_filt.as_deref(), - request.when_not_matched_insert_all, - request.when_not_matched_by_source_delete, - request.when_not_matched_by_source_delete_filt.as_deref(), + "merge_insert_into_table", + &id, ) .await - .map_err(convert_api_error) } async fn update_table(&self, request: UpdateTableRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::update_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/update", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "update_table", &id) .await - .map_err(convert_api_error) } async fn delete_from_table( @@ -612,27 +990,52 @@ impl LanceNamespace for RestNamespace { request: DeleteFromTableRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::delete_from_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/delete", encoded_id); + let query = [("delimiter", 
self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "delete_from_table", &id) .await - .map_err(convert_api_error) } async fn query_table(&self, request: QueryTableRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/query", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + + let url = format!("{}{}", self.rest_client.base_path(), path); + let req_builder = self + .rest_client + .client() + .post(&url) + .query(&query) + .json(&request); + + let resp = self + .rest_client + .execute(req_builder, "query_table", &id) + .await + .map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; - let response = - table_api::query_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) - .await - .map_err(convert_api_error)?; - - // Convert response to bytes - let bytes = response.bytes().await.map_err(|e| Error::IO { - source: box_error(e), - location: snafu::location!(), - })?; - - Ok(bytes) + let status = resp.status(); + if status.is_success() { + resp.bytes().await.map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + }) + } else { + let content = resp.text().await.map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + Err(Error::Namespace { + source: format!("Response error: status={}, content={}", status, content).into(), + location: snafu::location!(), + }) + } } async fn create_table_index( @@ -640,10 +1043,11 @@ impl LanceNamespace for RestNamespace { request: CreateTableIndexRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::create_table_index(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/create_index", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "create_table_index", &id) .await - .map_err(convert_api_error) } async fn list_table_indices( @@ -651,10 +1055,11 @@ impl LanceNamespace for RestNamespace { request: ListTableIndicesRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::list_table_indices(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/index/list", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "list_table_indices", &id) .await - .map_err(convert_api_error) } async fn describe_table_index_stats( @@ -662,20 +1067,16 @@ impl LanceNamespace for RestNamespace { request: DescribeTableIndexStatsRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - // Note: The index_name parameter seems to be missing from the request structure - // This might need to be adjusted based on the actual API - let index_name = ""; // This should come from somewhere in the request - - table_api::describe_table_index_stats( - &self.reqwest_config, - &id, - index_name, - request, - Some(&self.delimiter), - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let index_name = request.index_name.as_deref().unwrap_or(""); + let path = format!( + "/v1/table/{}/index/{}/stats", + encoded_id, + urlencode(index_name) + ); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "describe_table_index_stats", &id) + .await } async fn 
describe_transaction( @@ -683,15 +1084,11 @@ impl LanceNamespace for RestNamespace { request: DescribeTransactionRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - transaction_api::describe_transaction( - &self.reqwest_config, - &id, - request, - Some(&self.delimiter), - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/transaction/{}/describe", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "describe_transaction", &id) + .await } async fn alter_transaction( @@ -699,21 +1096,268 @@ impl LanceNamespace for RestNamespace { request: AlterTransactionRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/transaction/{}/alter", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "alter_transaction", &id) + .await + } - transaction_api::alter_transaction( - &self.reqwest_config, - &id, - request, - Some(&self.delimiter), - ) - .await - .map_err(convert_api_error) + async fn create_table_scalar_index( + &self, + request: CreateTableIndexRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/create_scalar_index", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "create_table_scalar_index", &id) + .await + } + + async fn drop_table_index( + &self, + request: DropTableIndexRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let index_name = request.index_name.as_deref().unwrap_or(""); + let path = format!( + "/v1/table/{}/index/{}/drop", + encoded_id, + urlencode(index_name) + ); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "drop_table_index", &id) + .await + } + + async fn list_all_tables(&self, request: ListTablesRequest) -> Result { + let path = "/v1/table"; + let mut query = vec![("delimiter", self.delimiter.as_str())]; + let page_token_str; + if let Some(ref pt) = request.page_token { + page_token_str = pt.clone(); + query.push(("page_token", page_token_str.as_str())); + } + let limit_str; + if let Some(limit) = request.limit { + limit_str = limit.to_string(); + query.push(("limit", limit_str.as_str())); + } + self.get_json(path, &query, "list_all_tables", "").await + } + + async fn restore_table(&self, request: RestoreTableRequest) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/restore", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "restore_table", &id) + .await + } + + async fn rename_table(&self, request: RenameTableRequest) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/rename", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "rename_table", &id) + .await + } + + async fn list_table_versions( + &self, + request: ListTableVersionsRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/version/list", encoded_id); + let mut query = 
vec![("delimiter", self.delimiter.as_str())]; + let page_token_str; + if let Some(ref pt) = request.page_token { + page_token_str = pt.clone(); + query.push(("page_token", page_token_str.as_str())); + } + let limit_str; + if let Some(limit) = request.limit { + limit_str = limit.to_string(); + query.push(("limit", limit_str.as_str())); + } + self.get_json(&path, &query, "list_table_versions", &id) + .await + } + + async fn update_table_schema_metadata( + &self, + request: UpdateTableSchemaMetadataRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/schema_metadata/update", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + let metadata = request.metadata.unwrap_or_default(); + let result: HashMap = self + .post_json( + &path, + &query, + &metadata, + "update_table_schema_metadata", + &id, + ) + .await?; + Ok(UpdateTableSchemaMetadataResponse { + metadata: Some(result), + ..Default::default() + }) + } + + async fn get_table_stats( + &self, + request: GetTableStatsRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/stats", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "get_table_stats", &id) + .await + } + + async fn explain_table_query_plan( + &self, + request: ExplainTableQueryPlanRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/explain_plan", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "explain_table_query_plan", &id) + .await + } + + async fn analyze_table_query_plan( + &self, + request: AnalyzeTableQueryPlanRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/analyze_plan", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "analyze_table_query_plan", &id) + .await + } + + async fn alter_table_add_columns( + &self, + request: AlterTableAddColumnsRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/add_columns", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "alter_table_add_columns", &id) + .await + } + + async fn alter_table_alter_columns( + &self, + request: AlterTableAlterColumnsRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/alter_columns", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "alter_table_alter_columns", &id) + .await + } + + async fn alter_table_drop_columns( + &self, + request: AlterTableDropColumnsRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/drop_columns", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "alter_table_drop_columns", &id) + .await + } + + async fn list_table_tags( + &self, + request: ListTableTagsRequest, + ) -> Result { + let id = object_id_str(&request.id, 
&self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/tags/list", encoded_id); + let mut query = vec![("delimiter", self.delimiter.as_str())]; + let page_token_str; + if let Some(ref pt) = request.page_token { + page_token_str = pt.clone(); + query.push(("page_token", page_token_str.as_str())); + } + let limit_str; + if let Some(limit) = request.limit { + limit_str = limit.to_string(); + query.push(("limit", limit_str.as_str())); + } + self.get_json(&path, &query, "list_table_tags", &id).await + } + + async fn get_table_tag_version( + &self, + request: GetTableTagVersionRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/tags/version", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "get_table_tag_version", &id) + .await + } + + async fn create_table_tag( + &self, + request: CreateTableTagRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/tags/create", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "create_table_tag", &id) + .await + } + + async fn delete_table_tag( + &self, + request: DeleteTableTagRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/tags/delete", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "delete_table_tag", &id) + .await + } + + async fn update_table_tag( + &self, + request: UpdateTableTagRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/tags/update", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "update_table_tag", &id) + .await } fn namespace_id(&self) -> String { format!( "RestNamespace {{ endpoint: {:?}, delimiter: {:?} }}", - self.reqwest_config.base_path, self.delimiter + self.rest_client.base_path(), + self.delimiter ) } } @@ -722,7 +1366,6 @@ impl LanceNamespace for RestNamespace { mod tests { use super::*; use bytes::Bytes; - use lance_namespace::models::{create_table_request, insert_into_table_request}; use wiremock::matchers::{method, path}; use wiremock::{Mock, MockServer, ResponseTemplate}; @@ -784,8 +1427,7 @@ mod tests { let request = ListNamespacesRequest { id: Some(vec!["test".to_string()]), - page_token: None, - limit: None, + ..Default::default() }; let result = namespace.list_namespaces(request).await; @@ -900,15 +1542,12 @@ mod tests { .await; // Create namespace with mock server URL - let mut reqwest_config = Configuration::new(); - reqwest_config.base_path = mock_server.uri(); - - let namespace = RestNamespace::with_configuration("$".to_string(), reqwest_config); + let namespace = RestNamespaceBuilder::new(mock_server.uri()).build(); let request = ListNamespacesRequest { id: Some(vec!["test".to_string()]), - page_token: None, limit: Some(10), + ..Default::default() }; let result = namespace.list_namespaces(request).await; @@ -939,15 +1578,12 @@ mod tests { .await; // Create namespace with mock server URL - let mut reqwest_config = Configuration::new(); - reqwest_config.base_path = mock_server.uri(); - - let namespace = 
RestNamespace::with_configuration("$".to_string(), reqwest_config); + let namespace = RestNamespaceBuilder::new(mock_server.uri()).build(); let request = ListNamespacesRequest { id: Some(vec!["test".to_string()]), - page_token: None, limit: Some(10), + ..Default::default() }; let result = namespace.list_namespaces(request).await; @@ -975,15 +1611,11 @@ mod tests { .await; // Create namespace with mock server URL - let mut reqwest_config = Configuration::new(); - reqwest_config.base_path = mock_server.uri(); - - let namespace = RestNamespace::with_configuration("$".to_string(), reqwest_config); + let namespace = RestNamespaceBuilder::new(mock_server.uri()).build(); let request = CreateNamespaceRequest { id: Some(vec!["test".to_string(), "newnamespace".to_string()]), - properties: None, - mode: None, + ..Default::default() }; let result = namespace.create_namespace(request).await; @@ -1012,10 +1644,7 @@ mod tests { .await; // Create namespace with mock server URL - let mut reqwest_config = Configuration::new(); - reqwest_config.base_path = mock_server.uri(); - - let namespace = RestNamespace::with_configuration("$".to_string(), reqwest_config); + let namespace = RestNamespaceBuilder::new(mock_server.uri()).build(); let request = CreateTableRequest { id: Some(vec![ @@ -1023,9 +1652,8 @@ mod tests { "namespace".to_string(), "table".to_string(), ]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), + ..Default::default() }; let data = Bytes::from("arrow data here"); @@ -1045,16 +1673,13 @@ mod tests { Mock::given(method("POST")) .and(path(path_str.as_str())) .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ - "version": 2 + "transaction_id": "txn-123" }))) .mount(&mock_server) .await; // Create namespace with mock server URL - let mut reqwest_config = Configuration::new(); - reqwest_config.base_path = mock_server.uri(); - - let namespace = RestNamespace::with_configuration("$".to_string(), reqwest_config); + let namespace = RestNamespaceBuilder::new(mock_server.uri()).build(); let request = InsertIntoTableRequest { id: Some(vec![ @@ -1062,7 +1687,8 @@ mod tests { "namespace".to_string(), "table".to_string(), ]), - mode: Some(insert_into_table_request::Mode::Append), + mode: Some("Append".to_string()), + ..Default::default() }; let data = Bytes::from("arrow data here"); @@ -1071,6 +1697,178 @@ mod tests { // Should succeed with mock server assert!(result.is_ok()); let response = result.unwrap(); - assert_eq!(response.version, Some(2)); + assert_eq!(response.transaction_id, Some("txn-123".to_string())); + } + + // Integration tests for DynamicContextProvider + + #[derive(Debug)] + struct TestContextProvider { + headers: HashMap, + } + + impl DynamicContextProvider for TestContextProvider { + fn provide_context(&self, _info: &OperationInfo) -> HashMap { + self.headers.clone() + } + } + + #[tokio::test] + async fn test_context_provider_headers_sent() { + let mock_server = MockServer::start().await; + + // Mock expects the context header + Mock::given(method("GET")) + .and(path("/v1/namespace/test/list")) + .and(wiremock::matchers::header( + "X-Context-Token", + "dynamic-token", + )) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "namespaces": [] + }))) + .mount(&mock_server) + .await; + + // Create context provider + let mut context_headers = HashMap::new(); + context_headers.insert( + "headers.X-Context-Token".to_string(), + "dynamic-token".to_string(), + ); + let 
provider = Arc::new(TestContextProvider { + headers: context_headers, + }); + + let namespace = RestNamespaceBuilder::new(mock_server.uri()) + .context_provider(provider) + .build(); + + let request = ListNamespacesRequest { + id: Some(vec!["test".to_string()]), + ..Default::default() + }; + + let result = namespace.list_namespaces(request).await; + assert!(result.is_ok(), "Failed: {:?}", result.err()); + } + + #[tokio::test] + async fn test_base_headers_merged_with_context_headers() { + let mock_server = MockServer::start().await; + + // Mock expects BOTH base header AND context header + Mock::given(method("GET")) + .and(path("/v1/namespace/test/list")) + .and(wiremock::matchers::header( + "Authorization", + "Bearer base-token", + )) + .and(wiremock::matchers::header( + "X-Context-Token", + "dynamic-token", + )) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "namespaces": [] + }))) + .mount(&mock_server) + .await; + + // Create context provider + let mut context_headers = HashMap::new(); + context_headers.insert( + "headers.X-Context-Token".to_string(), + "dynamic-token".to_string(), + ); + let provider = Arc::new(TestContextProvider { + headers: context_headers, + }); + + // Create namespace with base header AND context provider + let namespace = RestNamespaceBuilder::new(mock_server.uri()) + .header("Authorization", "Bearer base-token") + .context_provider(provider) + .build(); + + let request = ListNamespacesRequest { + id: Some(vec!["test".to_string()]), + ..Default::default() + }; + + let result = namespace.list_namespaces(request).await; + assert!(result.is_ok(), "Failed: {:?}", result.err()); + } + + #[tokio::test] + async fn test_context_headers_override_base_headers() { + let mock_server = MockServer::start().await; + + // Mock expects the CONTEXT header value (not base) + Mock::given(method("GET")) + .and(path("/v1/namespace/test/list")) + .and(wiremock::matchers::header( + "Authorization", + "Bearer context-override-token", + )) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "namespaces": [] + }))) + .mount(&mock_server) + .await; + + // Context provider that overrides Authorization header + let mut context_headers = HashMap::new(); + context_headers.insert( + "headers.Authorization".to_string(), + "Bearer context-override-token".to_string(), + ); + let provider = Arc::new(TestContextProvider { + headers: context_headers, + }); + + // Create namespace with base header that will be overridden + let namespace = RestNamespaceBuilder::new(mock_server.uri()) + .header("Authorization", "Bearer base-token") + .context_provider(provider) + .build(); + + let request = ListNamespacesRequest { + id: Some(vec!["test".to_string()]), + ..Default::default() + }; + + let result = namespace.list_namespaces(request).await; + assert!(result.is_ok(), "Failed: {:?}", result.err()); + } + + #[tokio::test] + async fn test_no_context_provider_uses_base_headers_only() { + let mock_server = MockServer::start().await; + + // Mock expects only the base header + Mock::given(method("GET")) + .and(path("/v1/namespace/test/list")) + .and(wiremock::matchers::header( + "Authorization", + "Bearer base-only", + )) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "namespaces": [] + }))) + .mount(&mock_server) + .await; + + // Create namespace WITHOUT context provider, only base headers + let namespace = RestNamespaceBuilder::new(mock_server.uri()) + .header("Authorization", "Bearer base-only") + .build(); + + let request 
= ListNamespacesRequest { + id: Some(vec!["test".to_string()]), + ..Default::default() + }; + + let result = namespace.list_namespaces(request).await; + assert!(result.is_ok(), "Failed: {:?}", result.err()); } } diff --git a/rust/lance-namespace-impls/src/rest_adapter.rs b/rust/lance-namespace-impls/src/rest_adapter.rs index 5e06f64570e..b63331c8a66 100644 --- a/rust/lance-namespace-impls/src/rest_adapter.rs +++ b/rust/lance-namespace-impls/src/rest_adapter.rs @@ -12,7 +12,7 @@ use std::sync::Arc; use axum::{ body::Bytes, extract::{Path, Query, Request, State}, - http::StatusCode, + http::{HeaderMap, StatusCode}, response::{IntoResponse, Response}, routing::{get, post}, Json, Router, ServiceExt, @@ -67,14 +67,66 @@ impl RestAdapter { .route("/v1/namespace/:id/drop", post(drop_namespace)) .route("/v1/namespace/:id/exists", post(namespace_exists)) .route("/v1/namespace/:id/table/list", get(list_tables)) - // Table operations + // Table metadata operations .route("/v1/table/:id/register", post(register_table)) .route("/v1/table/:id/describe", post(describe_table)) .route("/v1/table/:id/exists", post(table_exists)) .route("/v1/table/:id/drop", post(drop_table)) .route("/v1/table/:id/deregister", post(deregister_table)) + .route("/v1/table/:id/rename", post(rename_table)) + .route("/v1/table/:id/restore", post(restore_table)) + .route("/v1/table/:id/version/list", get(list_table_versions)) + .route("/v1/table/:id/stats", post(get_table_stats)) + // Table data operations .route("/v1/table/:id/create", post(create_table)) .route("/v1/table/:id/create-empty", post(create_empty_table)) + .route("/v1/table/:id/declare", post(declare_table)) + .route("/v1/table/:id/insert", post(insert_into_table)) + .route("/v1/table/:id/merge_insert", post(merge_insert_into_table)) + .route("/v1/table/:id/update", post(update_table)) + .route("/v1/table/:id/delete", post(delete_from_table)) + .route("/v1/table/:id/query", post(query_table)) + .route("/v1/table/:id/count_rows", get(count_table_rows)) + // Index operations + .route("/v1/table/:id/create_index", post(create_table_index)) + .route( + "/v1/table/:id/create_scalar_index", + post(create_table_scalar_index), + ) + .route("/v1/table/:id/index/list", post(list_table_indices)) + .route( + "/v1/table/:id/index/:index_name/stats", + post(describe_table_index_stats), + ) + .route( + "/v1/table/:id/index/:index_name/drop", + post(drop_table_index), + ) + // Schema operations + .route("/v1/table/:id/add_columns", post(alter_table_add_columns)) + .route( + "/v1/table/:id/alter_columns", + post(alter_table_alter_columns), + ) + .route("/v1/table/:id/drop_columns", post(alter_table_drop_columns)) + .route( + "/v1/table/:id/schema_metadata/update", + post(update_table_schema_metadata), + ) + // Tag operations + .route("/v1/table/:id/tags/list", get(list_table_tags)) + .route("/v1/table/:id/tags/version", post(get_table_tag_version)) + .route("/v1/table/:id/tags/create", post(create_table_tag)) + .route("/v1/table/:id/tags/delete", post(delete_table_tag)) + .route("/v1/table/:id/tags/update", post(update_table_tag)) + // Query plan operations + .route("/v1/table/:id/explain_plan", post(explain_table_query_plan)) + .route("/v1/table/:id/analyze_plan", post(analyze_table_query_plan)) + // Transaction operations + .route("/v1/transaction/:id/describe", post(describe_transaction)) + .route("/v1/transaction/:id/alter", post(alter_transaction)) + // Global table operations + .route("/v1/table", get(list_all_tables)) .layer(TraceLayer::new_for_http())
.with_state(self.backend.clone()) } @@ -260,11 +312,13 @@ fn error_to_response(err: Error) -> Response { async fn create_namespace( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.create_namespace(request).await { Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), @@ -274,6 +328,7 @@ async fn create_namespace( async fn list_namespaces( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, ) -> Response { @@ -281,6 +336,8 @@ async fn list_namespaces( id: Some(parse_id(&id, params.delimiter.as_deref())), page_token: params.page_token, limit: params.limit, + identity: extract_identity(&headers), + ..Default::default() }; match backend.list_namespaces(request).await { @@ -291,11 +348,13 @@ async fn list_namespaces( async fn describe_namespace( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.describe_namespace(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -305,11 +364,13 @@ async fn describe_namespace( async fn drop_namespace( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.drop_namespace(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -319,11 +380,13 @@ async fn drop_namespace( async fn namespace_exists( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.namespace_exists(request).await { Ok(_) => StatusCode::NO_CONTENT.into_response(), @@ -337,6 +400,7 @@ async fn namespace_exists( async fn list_tables( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, ) -> Response { @@ -344,6 +408,8 @@ async fn list_tables( id: Some(parse_id(&id, params.delimiter.as_deref())), page_token: params.page_token, limit: params.limit, + identity: extract_identity(&headers), + ..Default::default() }; match backend.list_tables(request).await { @@ -354,11 +420,13 @@ async fn list_tables( async fn register_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.register_table(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -368,11 +436,13 @@ async fn register_table( async fn describe_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.describe_table(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -382,11 +452,13 @@ async fn describe_table( async fn table_exists( State(backend): State>, + 
headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.table_exists(request).await { Ok(_) => StatusCode::NO_CONTENT.into_response(), @@ -396,11 +468,15 @@ async fn table_exists( async fn drop_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, - Json(mut request): Json, ) -> Response { - request.id = Some(parse_id(&id, params.delimiter.as_deref())); + let request = DropTableRequest { + id: Some(parse_id(&id, params.delimiter.as_deref())), + identity: extract_identity(&headers), + ..Default::default() + }; match backend.drop_table(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -410,11 +486,13 @@ async fn drop_table( async fn deregister_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.deregister_table(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -430,35 +508,20 @@ async fn deregister_table( struct CreateTableQuery { delimiter: Option, mode: Option, - location: Option, - properties: Option, } async fn create_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, body: Bytes, ) -> Response { - use lance_namespace::models::create_table_request::Mode; - - let mode = params.mode.as_deref().and_then(|m| match m { - "create" => Some(Mode::Create), - "exist_ok" => Some(Mode::ExistOk), - "overwrite" => Some(Mode::Overwrite), - _ => None, - }); - - let properties = params - .properties - .as_ref() - .and_then(|p| serde_json::from_str(p).ok()); - let request = CreateTableRequest { id: Some(parse_id(&id, params.delimiter.as_deref())), - location: params.location, - mode, - properties, + mode: params.mode.clone(), + identity: extract_identity(&headers), + ..Default::default() }; match backend.create_table(request, body).await { @@ -467,13 +530,16 @@ async fn create_table( } } +#[allow(deprecated)] async fn create_empty_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.create_empty_table(request).await { Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), @@ -481,6 +547,593 @@ async fn create_empty_table( } } +async fn declare_table( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.declare_table(request).await { + Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +#[derive(Debug, Deserialize)] +struct InsertQuery { + delimiter: Option, + mode: Option, +} + +async fn insert_into_table( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + body: Bytes, +) -> Response { + let request = InsertIntoTableRequest { + id: Some(parse_id(&id, params.delimiter.as_deref())), + mode: params.mode.clone(), + identity: extract_identity(&headers), + 
..Default::default() + }; + + match backend.insert_into_table(request, body).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +#[derive(Debug, Deserialize)] +struct MergeInsertQuery { + delimiter: Option, + on: Option, + when_matched_update_all: Option, + when_matched_update_all_filt: Option, + when_not_matched_insert_all: Option, + when_not_matched_by_source_delete: Option, + when_not_matched_by_source_delete_filt: Option, + timeout: Option, + use_index: Option, +} + +async fn merge_insert_into_table( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + body: Bytes, +) -> Response { + let request = MergeInsertIntoTableRequest { + id: Some(parse_id(&id, params.delimiter.as_deref())), + on: params.on, + when_matched_update_all: params.when_matched_update_all, + when_matched_update_all_filt: params.when_matched_update_all_filt, + when_not_matched_insert_all: params.when_not_matched_insert_all, + when_not_matched_by_source_delete: params.when_not_matched_by_source_delete, + when_not_matched_by_source_delete_filt: params.when_not_matched_by_source_delete_filt, + timeout: params.timeout, + use_index: params.use_index, + identity: extract_identity(&headers), + ..Default::default() + }; + + match backend.merge_insert_into_table(request, body).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn update_table( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.update_table(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn delete_from_table( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.delete_from_table(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn query_table( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.query_table(request).await { + Ok(bytes) => (StatusCode::OK, bytes).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn count_table_rows( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, +) -> Response { + let request = CountTableRowsRequest { + id: Some(parse_id(&id, params.delimiter.as_deref())), + version: None, + predicate: None, + identity: extract_identity(&headers), + ..Default::default() + }; + + match backend.count_table_rows(request).await { + Ok(count) => (StatusCode::OK, Json(serde_json::json!({ "count": count }))).into_response(), + Err(e) => error_to_response(e), + } +} + +// ============================================================================ +// Table Management Operation Handlers +// ============================================================================ + +async fn rename_table( + State(backend): State>, + 
headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.rename_table(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn restore_table( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.restore_table(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn list_table_versions( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, +) -> Response { + let request = ListTableVersionsRequest { + id: Some(parse_id(&id, params.delimiter.as_deref())), + page_token: params.page_token, + limit: params.limit, + identity: extract_identity(&headers), + ..Default::default() + }; + + match backend.list_table_versions(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn get_table_stats( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, +) -> Response { + let request = GetTableStatsRequest { + id: Some(parse_id(&id, params.delimiter.as_deref())), + identity: extract_identity(&headers), + ..Default::default() + }; + + match backend.get_table_stats(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn list_all_tables( + State(backend): State>, + headers: HeaderMap, + Query(params): Query, +) -> Response { + let request = ListTablesRequest { + id: None, + page_token: params.page_token, + limit: params.limit, + identity: extract_identity(&headers), + ..Default::default() + }; + + match backend.list_all_tables(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +// ============================================================================ +// Index Operation Handlers +// ============================================================================ + +async fn create_table_index( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.create_table_index(request).await { + Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn create_table_scalar_index( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.create_table_scalar_index(request).await { + Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn list_table_indices( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, +) -> Response { + let request = ListTableIndicesRequest { + id: Some(parse_id(&id, 
params.delimiter.as_deref())), + version: None, + page_token: None, + limit: None, + identity: extract_identity(&headers), + ..Default::default() + }; + + match backend.list_table_indices(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +#[derive(Debug, Deserialize)] +struct IndexPathParams { + id: String, + index_name: String, +} + +async fn describe_table_index_stats( + State(backend): State>, + headers: HeaderMap, + Path(params): Path, + Query(query): Query, +) -> Response { + let request = DescribeTableIndexStatsRequest { + id: Some(parse_id(¶ms.id, query.delimiter.as_deref())), + version: None, + index_name: Some(params.index_name), + identity: extract_identity(&headers), + ..Default::default() + }; + + match backend.describe_table_index_stats(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn drop_table_index( + State(backend): State>, + headers: HeaderMap, + Path(params): Path, + Query(query): Query, +) -> Response { + let request = DropTableIndexRequest { + id: Some(parse_id(¶ms.id, query.delimiter.as_deref())), + index_name: Some(params.index_name), + identity: extract_identity(&headers), + ..Default::default() + }; + + match backend.drop_table_index(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +// ============================================================================ +// Schema Operation Handlers +// ============================================================================ + +async fn alter_table_add_columns( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.alter_table_add_columns(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn alter_table_alter_columns( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.alter_table_alter_columns(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn alter_table_drop_columns( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.alter_table_drop_columns(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn update_table_schema_metadata( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.update_table_schema_metadata(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +// 
============================================================================ +// Tag Operation Handlers +// ============================================================================ + +async fn list_table_tags( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, +) -> Response { + let request = ListTableTagsRequest { + id: Some(parse_id(&id, params.delimiter.as_deref())), + page_token: params.page_token, + limit: params.limit, + identity: extract_identity(&headers), + ..Default::default() + }; + + match backend.list_table_tags(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn get_table_tag_version( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.get_table_tag_version(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn create_table_tag( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.create_table_tag(request).await { + Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn delete_table_tag( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.delete_table_tag(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn update_table_tag( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.update_table_tag(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +// ============================================================================ +// Query Plan Operation Handlers +// ============================================================================ + +async fn explain_table_query_plan( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.explain_table_query_plan(request).await { + Ok(plan) => (StatusCode::OK, plan).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn analyze_table_query_plan( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.analyze_table_query_plan(request).await { + Ok(plan) => (StatusCode::OK, plan).into_response(), + Err(e) => error_to_response(e), + } +} + 
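[Editor's note] Every handler above normalizes its path segment through the `parse_id` helper (added under Helper Functions later in this file) before delegating to the backend. Below is a minimal, self-contained sketch of that splitting behavior; `parse_id_sketch` is illustrative only, and the `$` fallback is an assumption, since the real helper's default delimiter is not visible in this hunk (the tests in this diff pass `delimiter("$")` explicitly).

fn parse_id_sketch(id: &str, delimiter: Option<&str>) -> Vec<String> {
    // Split a flat object id such as "ns$table" into path segments.
    // NOTE: the "$" fallback is assumed for illustration; the actual
    // default used by parse_id is not shown in this hunk.
    id.split(delimiter.unwrap_or("$"))
        .map(str::to_string)
        .collect()
}

fn main() {
    assert_eq!(
        parse_id_sketch("test_namespace$test_table", None),
        vec!["test_namespace".to_string(), "test_table".to_string()]
    );
    assert_eq!(
        parse_id_sketch("a.b.c", Some(".")),
        vec!["a".to_string(), "b".to_string(), "c".to_string()]
    );
}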
+// ============================================================================ +// Transaction Operation Handlers +// ============================================================================ + +async fn describe_transaction( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(_params): Query, + Json(mut request): Json, +) -> Response { + // The path id is the transaction identifier + // The request.id in body is the table ID (namespace path) + // For the trait, we set request.id to include both table ID and transaction ID + // by appending the transaction ID to the table ID path + if let Some(ref mut table_id) = request.id { + table_id.push(id); + } else { + request.id = Some(vec![id]); + } + request.identity = extract_identity(&headers); + + match backend.describe_transaction(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn alter_transaction( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(_params): Query, + Json(mut request): Json, +) -> Response { + // The path id is the transaction identifier + // Append it to the table ID path in the request + if let Some(ref mut table_id) = request.id { + table_id.push(id); + } else { + request.id = Some(vec![id]); + } + request.identity = extract_identity(&headers); + + match backend.alter_transaction(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + // ============================================================================ // Helper Functions // ============================================================================ @@ -501,6 +1154,36 @@ fn parse_id(id_str: &str, delimiter: Option<&str>) -> Vec { .collect() } +/// Extract identity information from HTTP headers +/// +/// Extracts `x-api-key` and `Authorization` (Bearer token) headers and returns +/// an Identity object if either is present. 
+fn extract_identity(headers: &HeaderMap) -> Option> { + let api_key = headers + .get("x-api-key") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + let auth_token = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .and_then(|s| { + // Extract token from "Bearer " format + s.strip_prefix("Bearer ") + .or_else(|| s.strip_prefix("bearer ")) + .map(|t| t.to_string()) + }); + + if api_key.is_some() || auth_token.is_some() { + Some(Box::new(Identity { + api_key, + auth_token, + })) + } else { + None + } +} + #[cfg(test)] mod tests { use super::*; @@ -644,6 +1327,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -698,6 +1382,7 @@ mod tests { id: Some(vec![format!("namespace{}", i)]), properties: None, mode: None, + ..Default::default() }; let result = fixture.namespace.create_namespace(create_req).await; assert!(result.is_ok(), "Failed to create namespace{}", i); @@ -708,6 +1393,7 @@ mod tests { id: Some(vec![]), page_token: None, limit: None, + ..Default::default() }; let result = fixture.namespace.list_namespaces(list_req).await; assert!(result.is_ok()); @@ -727,6 +1413,7 @@ mod tests { id: Some(vec!["parent".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -739,6 +1426,7 @@ mod tests { id: Some(vec!["parent".to_string(), "child1".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -750,6 +1438,7 @@ mod tests { id: Some(vec!["parent".to_string(), "child2".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -762,6 +1451,7 @@ mod tests { id: Some(vec!["parent".to_string()]), page_token: None, limit: None, + ..Default::default() }; let result = fixture.namespace.list_namespaces(list_req).await; assert!(result.is_ok()); @@ -781,6 +1471,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -791,9 +1482,8 @@ mod tests { // Create table in child namespace let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), + ..Default::default() }; let result = fixture @@ -834,6 +1524,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -845,9 +1536,8 @@ mod tests { for i in 1..=3 { let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), format!("table{}", i)]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), + ..Default::default() }; fixture .namespace @@ -861,6 +1551,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), page_token: None, limit: None, + ..Default::default() }; let result = fixture.namespace.list_tables(list_req).await; assert!(result.is_ok()); @@ -881,6 +1572,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -891,9 +1583,8 @@ mod tests { // Create table let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), + 
..Default::default() }; fixture .namespace @@ -909,6 +1600,7 @@ mod tests { } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + #[allow(deprecated)] async fn test_empty_table_exists_in_child_namespace() { let fixture = RestServerFixture::new().await; @@ -917,6 +1609,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -953,6 +1646,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -963,9 +1657,8 @@ mod tests { // Create table let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), + ..Default::default() }; fixture .namespace @@ -1043,6 +1736,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1053,9 +1747,8 @@ mod tests { // Create table let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), + ..Default::default() }; fixture .namespace @@ -1066,6 +1759,7 @@ mod tests { // Drop the table let drop_req = DropTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), + ..Default::default() }; let result = fixture.namespace.drop_table(drop_req).await; assert!( @@ -1084,6 +1778,7 @@ mod tests { } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + #[allow(deprecated)] async fn test_create_empty_table_in_child_namespace() { let fixture = RestServerFixture::new().await; @@ -1092,6 +1787,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1138,6 +1834,7 @@ mod tests { } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + #[allow(deprecated)] async fn test_describe_empty_table_in_child_namespace() { let fixture = RestServerFixture::new().await; @@ -1146,6 +1843,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1193,6 +1891,7 @@ mod tests { } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + #[allow(deprecated)] async fn test_drop_empty_table_in_child_namespace() { let fixture = RestServerFixture::new().await; @@ -1201,6 +1900,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1220,6 +1920,7 @@ mod tests { // Drop the empty table let drop_req = DropTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), + ..Default::default() }; let result = fixture.namespace.drop_table(drop_req).await; assert!( @@ -1238,6 +1939,7 @@ mod tests { } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + #[allow(deprecated)] async fn test_deeply_nested_namespace_with_empty_table() { let fixture = RestServerFixture::new().await; @@ -1246,6 +1948,7 @@ mod tests { id: Some(vec!["level1".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1257,6 +1960,7 @@ mod tests { id: Some(vec!["level1".to_string(), "level2".to_string()]), properties: None, mode: None, + ..Default::default() }; 
fixture .namespace @@ -1272,6 +1976,7 @@ mod tests { ]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1320,6 +2025,7 @@ mod tests { id: Some(vec!["level1".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1331,6 +2037,7 @@ mod tests { id: Some(vec!["level1".to_string(), "level2".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1346,6 +2053,7 @@ mod tests { ]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1361,9 +2069,8 @@ mod tests { "level3".to_string(), "deep_table".to_string(), ]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), + ..Default::default() }; let result = fixture @@ -1401,6 +2108,7 @@ mod tests { id: Some(vec!["namespace1".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1412,6 +2120,7 @@ mod tests { id: Some(vec!["namespace2".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1422,9 +2131,8 @@ mod tests { // Create table with same name in both namespaces let create_table_req = CreateTableRequest { id: Some(vec!["namespace1".to_string(), "shared_table".to_string()]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), + ..Default::default() }; fixture .namespace @@ -1434,9 +2142,8 @@ mod tests { let create_table_req = CreateTableRequest { id: Some(vec!["namespace2".to_string(), "shared_table".to_string()]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), + ..Default::default() }; fixture .namespace @@ -1447,6 +2154,7 @@ mod tests { // Drop table in namespace1 let drop_req = DropTableRequest { id: Some(vec!["namespace1".to_string(), "shared_table".to_string()]), + ..Default::default() }; fixture.namespace.drop_table(drop_req).await.unwrap(); @@ -1476,6 +2184,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1486,9 +2195,8 @@ mod tests { // Create table in namespace let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), + ..Default::default() }; fixture .namespace @@ -1521,6 +2229,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1540,6 +2249,7 @@ mod tests { // Verify namespace no longer exists let exists_req = NamespaceExistsRequest { id: Some(vec!["test_namespace".to_string()]), + ..Default::default() }; let result = fixture.namespace.namespace_exists(exists_req).await; assert!(result.is_err(), "Namespace should not exist after drop"); @@ -1560,6 +2270,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: Some(properties.clone()), mode: None, + ..Default::default() }; fixture .namespace @@ -1570,6 +2281,7 @@ mod tests { // Describe namespace and verify properties let describe_req = DescribeNamespaceRequest { id: Some(vec!["test_namespace".to_string()]), + ..Default::default() }; let result = fixture.namespace.describe_namespace(describe_req).await; assert!(result.is_ok()); @@ -1585,7 +2297,10 @@ mod tests { let fixture = 
RestServerFixture::new().await; // Root namespace should always exist - let exists_req = NamespaceExistsRequest { id: Some(vec![]) }; + let exists_req = NamespaceExistsRequest { + id: Some(vec![]), + ..Default::default() + }; let result = fixture.namespace.namespace_exists(exists_req).await; assert!(result.is_ok(), "Root namespace should exist"); @@ -1594,6 +2309,7 @@ mod tests { id: Some(vec![]), properties: None, mode: None, + ..Default::default() }; let result = fixture.namespace.create_namespace(create_req).await; assert!(result.is_err(), "Cannot create root namespace"); @@ -1627,6 +2343,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1640,9 +2357,8 @@ mod tests { "test_namespace".to_string(), "physical_table".to_string(), ]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), + ..Default::default() }; fixture .namespace @@ -1659,6 +2375,7 @@ mod tests { location: "test_namespace$physical_table.lance".to_string(), mode: None, properties: None, + ..Default::default() }; let result = fixture.namespace.register_table(register_req).await; @@ -1669,7 +2386,10 @@ mod tests { ); let response = result.unwrap(); - assert_eq!(response.location, "test_namespace$physical_table.lance"); + assert_eq!( + response.location, + Some("test_namespace$physical_table.lance".to_string()) + ); // Verify registered table exists let mut exists_req = TableExistsRequest::new(); @@ -1690,6 +2410,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1703,6 +2424,7 @@ mod tests { location: "s3://bucket/table.lance".to_string(), mode: None, properties: None, + ..Default::default() }; let result = fixture.namespace.register_table(register_req).await; @@ -1724,6 +2446,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1737,6 +2460,7 @@ mod tests { location: "../outside/table.lance".to_string(), mode: None, properties: None, + ..Default::default() }; let result = fixture.namespace.register_table(register_req).await; @@ -1759,6 +2483,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1769,9 +2494,8 @@ mod tests { // Create a table let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), + ..Default::default() }; fixture .namespace @@ -1791,6 +2515,7 @@ mod tests { // Deregister the table let deregister_req = DeregisterTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), + ..Default::default() }; let result = fixture.namespace.deregister_table(deregister_req).await; assert!( @@ -1836,6 +2561,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1849,9 +2575,8 @@ mod tests { "test_namespace".to_string(), "original_table".to_string(), ]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), + ..Default::default() }; let create_response = fixture .namespace @@ -1865,6 +2590,7 @@ mod tests { "test_namespace".to_string(), 
"original_table".to_string(), ]), + ..Default::default() }; fixture .namespace @@ -1894,6 +2620,7 @@ mod tests { location: relative_location.clone(), mode: None, properties: None, + ..Default::default() }; let register_response = fixture @@ -1903,7 +2630,7 @@ mod tests { .expect("Failed to re-register table with new name"); // Should return the exact location we registered - assert_eq!(register_response.location, relative_location); + assert_eq!(register_response.location, Some(relative_location.clone())); // Verify new table exists let mut exists_req = TableExistsRequest::new(); @@ -1968,15 +2695,10 @@ mod tests { .unwrap(); let reader1 = RecordBatchIterator::new(vec![data1].into_iter().map(Ok), schema.clone()); - let dataset = Dataset::write_into_namespace( - reader1, - namespace.clone(), - table_id.clone(), - None, - false, - ) - .await - .unwrap(); + let dataset = + Dataset::write_into_namespace(reader1, namespace.clone(), table_id.clone(), None) + .await + .unwrap(); assert_eq!(dataset.count_rows(None).await.unwrap(), 3); assert_eq!(dataset.version().version, 1); @@ -2002,7 +2724,6 @@ mod tests { namespace.clone(), table_id.clone(), Some(params_append), - false, ) .await .unwrap(); @@ -2031,7 +2752,6 @@ mod tests { namespace.clone(), table_id.clone(), Some(params_overwrite), - false, ) .await .unwrap(); @@ -2049,5 +2769,131 @@ mod tests { .unwrap(); assert_eq!(a_col.values(), &[100, 200]); } + + // ============================================================================ + // DynamicContextProvider Integration Test + // ============================================================================ + + use crate::context::{DynamicContextProvider, OperationInfo}; + use std::collections::HashMap; + + /// Test context provider that adds custom headers to every request. 
+ #[derive(Debug)] + struct TestDynamicContextProvider { + headers: HashMap, + } + + impl DynamicContextProvider for TestDynamicContextProvider { + fn provide_context(&self, _info: &OperationInfo) -> HashMap { + self.headers.clone() + } + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_rest_namespace_with_context_provider() { + let temp_dir = TempDir::new().unwrap(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + + // Create DirectoryNamespace backend with manifest enabled + let backend = DirectoryNamespaceBuilder::new(&temp_path) + .manifest_enabled(true) + .build() + .await + .unwrap(); + let backend = Arc::new(backend); + + // Start REST server + let config = RestAdapterConfig { + port: 0, + ..Default::default() + }; + + let server = RestAdapter::new(backend.clone(), config); + let server_handle = server.start().await.unwrap(); + let actual_port = server_handle.port(); + + // Create context provider that adds custom headers + let mut context_headers = HashMap::new(); + context_headers.insert( + "headers.X-Custom-Auth".to_string(), + "test-auth-token".to_string(), + ); + context_headers.insert( + "headers.X-Request-Source".to_string(), + "integration-test".to_string(), + ); + + let provider = Arc::new(TestDynamicContextProvider { + headers: context_headers, + }); + + // Create RestNamespace client with context provider and base headers + let server_url = format!("http://127.0.0.1:{}", actual_port); + let namespace = RestNamespaceBuilder::new(&server_url) + .delimiter("$") + .header("X-Base-Header", "base-value") + .context_provider(provider) + .build(); + + // Create a namespace - should work with context provider + let create_req = CreateNamespaceRequest { + id: Some(vec!["context_test_ns".to_string()]), + properties: None, + mode: None, + identity: None, + context: None, + }; + let result = namespace.create_namespace(create_req).await; + assert!(result.is_ok(), "Failed to create namespace: {:?}", result); + + // List namespaces - should also work + let list_req = ListNamespacesRequest { + id: Some(vec![]), + limit: Some(10), + page_token: None, + identity: None, + context: None, + }; + let result = namespace.list_namespaces(list_req).await; + assert!(result.is_ok(), "Failed to list namespaces: {:?}", result); + let response = result.unwrap(); + assert!( + response.namespaces.contains(&"context_test_ns".to_string()), + "Namespace not found in list" + ); + + // Create a table - should work with context provider + let table_data = create_test_arrow_data(); + let create_table_req = CreateTableRequest { + id: Some(vec![ + "context_test_ns".to_string(), + "test_table".to_string(), + ]), + mode: Some("create".to_string()), + identity: None, + context: None, + }; + let result = namespace.create_table(create_table_req, table_data).await; + assert!(result.is_ok(), "Failed to create table: {:?}", result); + + // Describe the table - should work with context provider + let describe_req = DescribeTableRequest { + id: Some(vec![ + "context_test_ns".to_string(), + "test_table".to_string(), + ]), + with_table_uri: None, + load_detailed_metadata: None, + vend_credentials: None, + version: None, + identity: None, + context: None, + }; + let result = namespace.describe_table(describe_req).await; + assert!(result.is_ok(), "Failed to describe table: {:?}", result); + + // Cleanup + server_handle.shutdown(); + } } } diff --git a/rust/lance-namespace/src/error.rs b/rust/lance-namespace/src/error.rs new file mode 100644 index 00000000000..71fb7c12c31 --- 
/dev/null +++ b/rust/lance-namespace/src/error.rs @@ -0,0 +1,404 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Lance Namespace error types. +//! +//! This module defines fine-grained error types for Lance Namespace operations. +//! Each error type has a unique numeric code that is consistent across all +//! Lance Namespace implementations (Python, Java, Rust, REST). +//! +//! # Error Handling +//! +//! Namespace operations return [`NamespaceError`] which can be converted to +//! [`lance_core::Error`] for integration with the Lance ecosystem. +//! +//! ```rust,ignore +//! use lance_namespace::{NamespaceError, ErrorCode}; +//! +//! // Create and use namespace errors +//! let err = NamespaceError::TableNotFound { +//! message: "Table 'users' not found".into(), +//! }; +//! assert_eq!(err.code(), ErrorCode::TableNotFound); +//! +//! // Convert to lance_core::Error +//! let lance_err: lance_core::Error = err.into(); +//! ``` + +use lance_core::error::ToSnafuLocation; +use snafu::Snafu; + +/// Lance Namespace error codes. +/// +/// These codes are globally unique across all Lance Namespace implementations +/// (Python, Java, Rust, REST). Use these codes for programmatic error handling. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[repr(u32)] +pub enum ErrorCode { + /// Operation not supported by this backend + Unsupported = 0, + /// The specified namespace does not exist + NamespaceNotFound = 1, + /// A namespace with this name already exists + NamespaceAlreadyExists = 2, + /// Namespace contains tables or child namespaces + NamespaceNotEmpty = 3, + /// The specified table does not exist + TableNotFound = 4, + /// A table with this name already exists + TableAlreadyExists = 5, + /// The specified table index does not exist + TableIndexNotFound = 6, + /// A table index with this name already exists + TableIndexAlreadyExists = 7, + /// The specified table tag does not exist + TableTagNotFound = 8, + /// A table tag with this name already exists + TableTagAlreadyExists = 9, + /// The specified transaction does not exist + TransactionNotFound = 10, + /// The specified table version does not exist + TableVersionNotFound = 11, + /// The specified table column does not exist + TableColumnNotFound = 12, + /// Malformed request or invalid parameters + InvalidInput = 13, + /// Optimistic concurrency conflict + ConcurrentModification = 14, + /// User lacks permission for this operation + PermissionDenied = 15, + /// Authentication credentials are missing or invalid + Unauthenticated = 16, + /// Service is temporarily unavailable + ServiceUnavailable = 17, + /// Unexpected server/implementation error + Internal = 18, + /// Table is in an invalid state for the operation + InvalidTableState = 19, + /// Table schema validation failed + TableSchemaValidationError = 20, +} + +impl ErrorCode { + /// Returns the numeric code value. + pub fn as_u32(self) -> u32 { + self as u32 + } + + /// Creates an ErrorCode from a numeric code. + /// + /// Returns `None` if the code is not recognized. 
+ pub fn from_u32(code: u32) -> Option { + match code { + 0 => Some(Self::Unsupported), + 1 => Some(Self::NamespaceNotFound), + 2 => Some(Self::NamespaceAlreadyExists), + 3 => Some(Self::NamespaceNotEmpty), + 4 => Some(Self::TableNotFound), + 5 => Some(Self::TableAlreadyExists), + 6 => Some(Self::TableIndexNotFound), + 7 => Some(Self::TableIndexAlreadyExists), + 8 => Some(Self::TableTagNotFound), + 9 => Some(Self::TableTagAlreadyExists), + 10 => Some(Self::TransactionNotFound), + 11 => Some(Self::TableVersionNotFound), + 12 => Some(Self::TableColumnNotFound), + 13 => Some(Self::InvalidInput), + 14 => Some(Self::ConcurrentModification), + 15 => Some(Self::PermissionDenied), + 16 => Some(Self::Unauthenticated), + 17 => Some(Self::ServiceUnavailable), + 18 => Some(Self::Internal), + 19 => Some(Self::InvalidTableState), + 20 => Some(Self::TableSchemaValidationError), + _ => None, + } + } +} + +impl std::fmt::Display for ErrorCode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let name = match self { + Self::Unsupported => "Unsupported", + Self::NamespaceNotFound => "NamespaceNotFound", + Self::NamespaceAlreadyExists => "NamespaceAlreadyExists", + Self::NamespaceNotEmpty => "NamespaceNotEmpty", + Self::TableNotFound => "TableNotFound", + Self::TableAlreadyExists => "TableAlreadyExists", + Self::TableIndexNotFound => "TableIndexNotFound", + Self::TableIndexAlreadyExists => "TableIndexAlreadyExists", + Self::TableTagNotFound => "TableTagNotFound", + Self::TableTagAlreadyExists => "TableTagAlreadyExists", + Self::TransactionNotFound => "TransactionNotFound", + Self::TableVersionNotFound => "TableVersionNotFound", + Self::TableColumnNotFound => "TableColumnNotFound", + Self::InvalidInput => "InvalidInput", + Self::ConcurrentModification => "ConcurrentModification", + Self::PermissionDenied => "PermissionDenied", + Self::Unauthenticated => "Unauthenticated", + Self::ServiceUnavailable => "ServiceUnavailable", + Self::Internal => "Internal", + Self::InvalidTableState => "InvalidTableState", + Self::TableSchemaValidationError => "TableSchemaValidationError", + }; + write!(f, "{}", name) + } +} + +/// Lance Namespace error type. +/// +/// This enum provides fine-grained error types for Lance Namespace operations. +/// Each variant corresponds to a specific error condition and has an associated +/// [`ErrorCode`] accessible via the [`code()`](NamespaceError::code) method. +/// +/// # Converting to lance_core::Error +/// +/// `NamespaceError` implements `Into`, preserving the original +/// error so it can be downcast later: +/// +/// ```rust,ignore +/// let ns_err = NamespaceError::TableNotFound { message: "...".into() }; +/// let lance_err: lance_core::Error = ns_err.into(); +/// +/// // Later, extract the original error: +/// if let lance_core::Error::Namespace { source, .. } = &lance_err { +/// if let Some(ns_err) = source.downcast_ref::() { +/// println!("Error code: {:?}", ns_err.code()); +/// } +/// } +/// ``` +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum NamespaceError { + /// Operation not supported by this backend. + #[snafu(display("Unsupported: {message}"))] + Unsupported { message: String }, + + /// The specified namespace does not exist. + #[snafu(display("Namespace not found: {message}"))] + NamespaceNotFound { message: String }, + + /// A namespace with this name already exists. + #[snafu(display("Namespace already exists: {message}"))] + NamespaceAlreadyExists { message: String }, + + /// Namespace contains tables or child namespaces. 
+ #[snafu(display("Namespace not empty: {message}"))] + NamespaceNotEmpty { message: String }, + + /// The specified table does not exist. + #[snafu(display("Table not found: {message}"))] + TableNotFound { message: String }, + + /// A table with this name already exists. + #[snafu(display("Table already exists: {message}"))] + TableAlreadyExists { message: String }, + + /// The specified table index does not exist. + #[snafu(display("Table index not found: {message}"))] + TableIndexNotFound { message: String }, + + /// A table index with this name already exists. + #[snafu(display("Table index already exists: {message}"))] + TableIndexAlreadyExists { message: String }, + + /// The specified table tag does not exist. + #[snafu(display("Table tag not found: {message}"))] + TableTagNotFound { message: String }, + + /// A table tag with this name already exists. + #[snafu(display("Table tag already exists: {message}"))] + TableTagAlreadyExists { message: String }, + + /// The specified transaction does not exist. + #[snafu(display("Transaction not found: {message}"))] + TransactionNotFound { message: String }, + + /// The specified table version does not exist. + #[snafu(display("Table version not found: {message}"))] + TableVersionNotFound { message: String }, + + /// The specified table column does not exist. + #[snafu(display("Table column not found: {message}"))] + TableColumnNotFound { message: String }, + + /// Malformed request or invalid parameters. + #[snafu(display("Invalid input: {message}"))] + InvalidInput { message: String }, + + /// Optimistic concurrency conflict. + #[snafu(display("Concurrent modification: {message}"))] + ConcurrentModification { message: String }, + + /// User lacks permission for this operation. + #[snafu(display("Permission denied: {message}"))] + PermissionDenied { message: String }, + + /// Authentication credentials are missing or invalid. + #[snafu(display("Unauthenticated: {message}"))] + Unauthenticated { message: String }, + + /// Service is temporarily unavailable. + #[snafu(display("Service unavailable: {message}"))] + ServiceUnavailable { message: String }, + + /// Unexpected internal error. + #[snafu(display("Internal error: {message}"))] + Internal { message: String }, + + /// Table is in an invalid state for the operation. + #[snafu(display("Invalid table state: {message}"))] + InvalidTableState { message: String }, + + /// Table schema validation failed. + #[snafu(display("Table schema validation error: {message}"))] + TableSchemaValidationError { message: String }, +} + +impl NamespaceError { + /// Returns the error code for this error. + /// + /// Use this for programmatic error handling across language boundaries. + pub fn code(&self) -> ErrorCode { + match self { + Self::Unsupported { .. } => ErrorCode::Unsupported, + Self::NamespaceNotFound { .. } => ErrorCode::NamespaceNotFound, + Self::NamespaceAlreadyExists { .. } => ErrorCode::NamespaceAlreadyExists, + Self::NamespaceNotEmpty { .. } => ErrorCode::NamespaceNotEmpty, + Self::TableNotFound { .. } => ErrorCode::TableNotFound, + Self::TableAlreadyExists { .. } => ErrorCode::TableAlreadyExists, + Self::TableIndexNotFound { .. } => ErrorCode::TableIndexNotFound, + Self::TableIndexAlreadyExists { .. } => ErrorCode::TableIndexAlreadyExists, + Self::TableTagNotFound { .. } => ErrorCode::TableTagNotFound, + Self::TableTagAlreadyExists { .. } => ErrorCode::TableTagAlreadyExists, + Self::TransactionNotFound { .. } => ErrorCode::TransactionNotFound, + Self::TableVersionNotFound { .. 
} => ErrorCode::TableVersionNotFound, + Self::TableColumnNotFound { .. } => ErrorCode::TableColumnNotFound, + Self::InvalidInput { .. } => ErrorCode::InvalidInput, + Self::ConcurrentModification { .. } => ErrorCode::ConcurrentModification, + Self::PermissionDenied { .. } => ErrorCode::PermissionDenied, + Self::Unauthenticated { .. } => ErrorCode::Unauthenticated, + Self::ServiceUnavailable { .. } => ErrorCode::ServiceUnavailable, + Self::Internal { .. } => ErrorCode::Internal, + Self::InvalidTableState { .. } => ErrorCode::InvalidTableState, + Self::TableSchemaValidationError { .. } => ErrorCode::TableSchemaValidationError, + } + } + + /// Creates a NamespaceError from an error code and message. + /// + /// This is useful when receiving errors from REST API or other language bindings. + pub fn from_code(code: u32, message: impl Into) -> Self { + let message = message.into(); + match ErrorCode::from_u32(code) { + Some(ErrorCode::Unsupported) => Self::Unsupported { message }, + Some(ErrorCode::NamespaceNotFound) => Self::NamespaceNotFound { message }, + Some(ErrorCode::NamespaceAlreadyExists) => Self::NamespaceAlreadyExists { message }, + Some(ErrorCode::NamespaceNotEmpty) => Self::NamespaceNotEmpty { message }, + Some(ErrorCode::TableNotFound) => Self::TableNotFound { message }, + Some(ErrorCode::TableAlreadyExists) => Self::TableAlreadyExists { message }, + Some(ErrorCode::TableIndexNotFound) => Self::TableIndexNotFound { message }, + Some(ErrorCode::TableIndexAlreadyExists) => Self::TableIndexAlreadyExists { message }, + Some(ErrorCode::TableTagNotFound) => Self::TableTagNotFound { message }, + Some(ErrorCode::TableTagAlreadyExists) => Self::TableTagAlreadyExists { message }, + Some(ErrorCode::TransactionNotFound) => Self::TransactionNotFound { message }, + Some(ErrorCode::TableVersionNotFound) => Self::TableVersionNotFound { message }, + Some(ErrorCode::TableColumnNotFound) => Self::TableColumnNotFound { message }, + Some(ErrorCode::InvalidInput) => Self::InvalidInput { message }, + Some(ErrorCode::ConcurrentModification) => Self::ConcurrentModification { message }, + Some(ErrorCode::PermissionDenied) => Self::PermissionDenied { message }, + Some(ErrorCode::Unauthenticated) => Self::Unauthenticated { message }, + Some(ErrorCode::ServiceUnavailable) => Self::ServiceUnavailable { message }, + Some(ErrorCode::Internal) => Self::Internal { message }, + Some(ErrorCode::InvalidTableState) => Self::InvalidTableState { message }, + Some(ErrorCode::TableSchemaValidationError) => { + Self::TableSchemaValidationError { message } + } + None => Self::Internal { message }, + } + } +} + +/// Converts a NamespaceError into a lance_core::Error. +/// +/// The original `NamespaceError` is preserved in the `source` field and can be +/// extracted via downcasting for programmatic error handling. +impl From for lance_core::Error { + #[track_caller] + fn from(err: NamespaceError) -> Self { + Self::Namespace { + source: Box::new(err), + location: std::panic::Location::caller().to_snafu_location(), + } + } +} + +/// Result type for namespace operations. 
+pub type Result = std::result::Result; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_error_code_roundtrip() { + for code in 0..=20 { + let error_code = ErrorCode::from_u32(code).unwrap(); + assert_eq!(error_code.as_u32(), code); + } + } + + #[test] + fn test_unknown_error_code() { + assert!(ErrorCode::from_u32(999).is_none()); + } + + #[test] + fn test_namespace_error_code() { + let err = NamespaceError::TableNotFound { + message: "test table".to_string(), + }; + assert_eq!(err.code(), ErrorCode::TableNotFound); + assert_eq!(err.code().as_u32(), 4); + } + + #[test] + fn test_from_code() { + let err = NamespaceError::from_code(4, "table not found"); + assert_eq!(err.code(), ErrorCode::TableNotFound); + assert!(err.to_string().contains("table not found")); + } + + #[test] + fn test_from_unknown_code() { + let err = NamespaceError::from_code(999, "unknown error"); + assert_eq!(err.code(), ErrorCode::Internal); + } + + #[test] + fn test_convert_to_lance_error() { + let ns_err = NamespaceError::TableNotFound { + message: "users".to_string(), + }; + let lance_err: lance_core::Error = ns_err.into(); + + // Verify it's a Namespace error + match &lance_err { + lance_core::Error::Namespace { source, .. } => { + // Downcast to get the original error + let downcast = source.downcast_ref::(); + assert!(downcast.is_some()); + assert_eq!(downcast.unwrap().code(), ErrorCode::TableNotFound); + } + _ => panic!("Expected Namespace error"), + } + } + + #[test] + fn test_error_display() { + let err = NamespaceError::TableNotFound { + message: "users".to_string(), + }; + assert_eq!(err.to_string(), "Table not found: users"); + } +} diff --git a/rust/lance-namespace/src/lib.rs b/rust/lance-namespace/src/lib.rs index 51bd18a2fb5..6fd9a9b7ab2 100644 --- a/rust/lance-namespace/src/lib.rs +++ b/rust/lance-namespace/src/lib.rs @@ -5,7 +5,17 @@ //! //! A Rust client for the Lance Namespace API that provides a unified interface //! for managing namespaces and tables across different backend implementations. +//! +//! # Error Handling +//! +//! This crate provides fine-grained error types through the [`error`] module. +//! Each error type has a unique numeric code that is consistent across all +//! Lance Namespace implementations (Python, Java, Rust, REST). +//! +//! See [`error::ErrorCode`] for the list of error codes and +//! [`error::NamespaceError`] for the error types. 
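[Editor's note] Because every implementation maps failures onto the same numeric codes, callers can branch on `ErrorCode` regardless of which backend produced the error. A short sketch using only the API introduced above (`advise` is a hypothetical helper, not part of this diff):

use lance_namespace::error::{ErrorCode, NamespaceError};

// Sketch: turn a namespace error from any implementation into a retry hint
// using its stable numeric code.
fn advise(err: &NamespaceError) -> &'static str {
    match err.code() {
        ErrorCode::NamespaceNotFound | ErrorCode::TableNotFound => "fix the id; do not retry",
        ErrorCode::ConcurrentModification | ErrorCode::ServiceUnavailable => "retry with backoff",
        _ => "surface to the caller",
    }
}

fn main() {
    // Code 14 is ConcurrentModification per the ErrorCode table above.
    let err = NamespaceError::from_code(14, "version conflict");
    assert_eq!(advise(&err), "retry with backoff");
}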
+pub mod error; pub mod namespace; pub mod schema; @@ -13,6 +23,9 @@ pub mod schema; pub use lance_core::{Error, Result}; pub use namespace::LanceNamespace; +// Re-export error types +pub use error::{ErrorCode, NamespaceError, Result as NamespaceResult}; + // Re-export reqwest client for convenience pub use lance_namespace_reqwest_client as reqwest_client; diff --git a/rust/lance-namespace/src/namespace.rs b/rust/lance-namespace/src/namespace.rs index ac2d0c8e176..3e27df15ba7 100644 --- a/rust/lance-namespace/src/namespace.rs +++ b/rust/lance-namespace/src/namespace.rs @@ -9,19 +9,29 @@ use lance_core::{Error, Result}; use snafu::Location; use lance_namespace_reqwest_client::models::{ - AlterTransactionRequest, AlterTransactionResponse, CountTableRowsRequest, - CreateEmptyTableRequest, CreateEmptyTableResponse, CreateNamespaceRequest, - CreateNamespaceResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, - CreateTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, DeregisterTableRequest, - DeregisterTableResponse, DescribeNamespaceRequest, DescribeNamespaceResponse, - DescribeTableIndexStatsRequest, DescribeTableIndexStatsResponse, DescribeTableRequest, - DescribeTableResponse, DescribeTransactionRequest, DescribeTransactionResponse, - DropNamespaceRequest, DropNamespaceResponse, DropTableRequest, DropTableResponse, - InsertIntoTableRequest, InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, - ListTableIndicesRequest, ListTableIndicesResponse, ListTablesRequest, ListTablesResponse, + AlterTableAddColumnsRequest, AlterTableAddColumnsResponse, AlterTableAlterColumnsRequest, + AlterTableAlterColumnsResponse, AlterTableDropColumnsRequest, AlterTableDropColumnsResponse, + AlterTransactionRequest, AlterTransactionResponse, AnalyzeTableQueryPlanRequest, + CountTableRowsRequest, CreateEmptyTableRequest, CreateEmptyTableResponse, + CreateNamespaceRequest, CreateNamespaceResponse, CreateTableIndexRequest, + CreateTableIndexResponse, CreateTableRequest, CreateTableResponse, + CreateTableScalarIndexResponse, CreateTableTagRequest, CreateTableTagResponse, + DeclareTableRequest, DeclareTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, + DeleteTableTagRequest, DeleteTableTagResponse, DeregisterTableRequest, DeregisterTableResponse, + DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableIndexStatsRequest, + DescribeTableIndexStatsResponse, DescribeTableRequest, DescribeTableResponse, + DescribeTransactionRequest, DescribeTransactionResponse, DropNamespaceRequest, + DropNamespaceResponse, DropTableIndexRequest, DropTableIndexResponse, DropTableRequest, + DropTableResponse, ExplainTableQueryPlanRequest, GetTableStatsRequest, GetTableStatsResponse, + GetTableTagVersionRequest, GetTableTagVersionResponse, InsertIntoTableRequest, + InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, + ListTableIndicesRequest, ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, + ListTableVersionsRequest, ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, MergeInsertIntoTableRequest, MergeInsertIntoTableResponse, NamespaceExistsRequest, - QueryTableRequest, RegisterTableRequest, RegisterTableResponse, TableExistsRequest, - UpdateTableRequest, UpdateTableResponse, + QueryTableRequest, RegisterTableRequest, RegisterTableResponse, RenameTableRequest, + RenameTableResponse, RestoreTableRequest, RestoreTableResponse, TableExistsRequest, + UpdateTableRequest, UpdateTableResponse, 
UpdateTableSchemaMetadataRequest, + UpdateTableSchemaMetadataResponse, UpdateTableTagRequest, UpdateTableTagResponse, }; /// Base trait for Lance Namespace implementations. @@ -29,9 +39,26 @@ use lance_namespace_reqwest_client::models::{ /// This trait defines the interface that all Lance namespace implementations /// must provide. Each method corresponds to a specific operation on namespaces /// or tables. +/// +/// # Error Handling +/// +/// All operations may return the following common errors (via [`crate::NamespaceError`]): +/// +/// - [`crate::ErrorCode::Unsupported`] - Operation not supported by this backend +/// - [`crate::ErrorCode::InvalidInput`] - Invalid request parameters +/// - [`crate::ErrorCode::PermissionDenied`] - Insufficient permissions +/// - [`crate::ErrorCode::Unauthenticated`] - Invalid credentials +/// - [`crate::ErrorCode::ServiceUnavailable`] - Service temporarily unavailable +/// - [`crate::ErrorCode::Internal`] - Unexpected internal error +/// +/// See individual method documentation for operation-specific errors. #[async_trait] pub trait LanceNamespace: Send + Sync + std::fmt::Debug { /// List namespaces. + /// + /// # Errors + /// + /// Returns [`crate::ErrorCode::NamespaceNotFound`] if the parent namespace does not exist. async fn list_namespaces( &self, _request: ListNamespacesRequest, @@ -43,6 +70,10 @@ pub trait LanceNamespace: Send + Sync + std::fmt::Debug { } /// Describe a namespace. + /// + /// # Errors + /// + /// Returns [`crate::ErrorCode::NamespaceNotFound`] if the namespace does not exist. async fn describe_namespace( &self, _request: DescribeNamespaceRequest, @@ -54,6 +85,10 @@ pub trait LanceNamespace: Send + Sync + std::fmt::Debug { } /// Create a new namespace. + /// + /// # Errors + /// + /// Returns [`crate::ErrorCode::NamespaceAlreadyExists`] if a namespace with the same name already exists. async fn create_namespace( &self, _request: CreateNamespaceRequest, @@ -65,6 +100,11 @@ pub trait LanceNamespace: Send + Sync + std::fmt::Debug { } /// Drop a namespace. + /// + /// # Errors + /// + /// - [`crate::ErrorCode::NamespaceNotFound`] if the namespace does not exist. + /// - [`crate::ErrorCode::NamespaceNotEmpty`] if the namespace contains tables or child namespaces. async fn drop_namespace( &self, _request: DropNamespaceRequest, @@ -76,6 +116,10 @@ pub trait LanceNamespace: Send + Sync + std::fmt::Debug { } /// Check if a namespace exists. + /// + /// # Errors + /// + /// Returns [`crate::ErrorCode::NamespaceNotFound`] if the namespace does not exist. async fn namespace_exists(&self, _request: NamespaceExistsRequest) -> Result<()> { Err(Error::NotSupported { source: "namespace_exists not implemented".into(), @@ -160,7 +204,23 @@ pub trait LanceNamespace: Send + Sync + std::fmt::Debug { }) } + /// Declare a table (metadata only operation). + async fn declare_table(&self, _request: DeclareTableRequest) -> Result { + Err(Error::NotSupported { + source: "declare_table not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + /// Create an empty table (metadata only operation). + /// + /// # Deprecated + /// + /// Use [`declare_table`](Self::declare_table) instead. Support will be removed in 3.0.0. + #[deprecated( + since = "2.0.0", + note = "Use declare_table instead. Support will be removed in 3.0.0." + )] async fn create_empty_table( &self, _request: CreateEmptyTableRequest, @@ -277,6 +337,195 @@ pub trait LanceNamespace: Send + Sync + std::fmt::Debug { }) } + /// Create a scalar index on a table. 
+ async fn create_table_scalar_index( + &self, + _request: CreateTableIndexRequest, + ) -> Result { + Err(Error::NotSupported { + source: "create_table_scalar_index not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Drop a table index. + async fn drop_table_index( + &self, + _request: DropTableIndexRequest, + ) -> Result { + Err(Error::NotSupported { + source: "drop_table_index not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// List all tables across all namespaces. + async fn list_all_tables(&self, _request: ListTablesRequest) -> Result { + Err(Error::NotSupported { + source: "list_all_tables not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Restore a table to a specific version. + async fn restore_table(&self, _request: RestoreTableRequest) -> Result { + Err(Error::NotSupported { + source: "restore_table not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Rename a table. + async fn rename_table(&self, _request: RenameTableRequest) -> Result { + Err(Error::NotSupported { + source: "rename_table not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// List all versions of a table. + async fn list_table_versions( + &self, + _request: ListTableVersionsRequest, + ) -> Result { + Err(Error::NotSupported { + source: "list_table_versions not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Update table schema metadata. + async fn update_table_schema_metadata( + &self, + _request: UpdateTableSchemaMetadataRequest, + ) -> Result { + Err(Error::NotSupported { + source: "update_table_schema_metadata not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Get table statistics. + async fn get_table_stats( + &self, + _request: GetTableStatsRequest, + ) -> Result { + Err(Error::NotSupported { + source: "get_table_stats not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Explain a table query plan. + async fn explain_table_query_plan( + &self, + _request: ExplainTableQueryPlanRequest, + ) -> Result { + Err(Error::NotSupported { + source: "explain_table_query_plan not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Analyze a table query plan. + async fn analyze_table_query_plan( + &self, + _request: AnalyzeTableQueryPlanRequest, + ) -> Result { + Err(Error::NotSupported { + source: "analyze_table_query_plan not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Add columns to a table. + async fn alter_table_add_columns( + &self, + _request: AlterTableAddColumnsRequest, + ) -> Result { + Err(Error::NotSupported { + source: "alter_table_add_columns not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Alter columns in a table. + async fn alter_table_alter_columns( + &self, + _request: AlterTableAlterColumnsRequest, + ) -> Result { + Err(Error::NotSupported { + source: "alter_table_alter_columns not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Drop columns from a table. 
+ async fn alter_table_drop_columns( + &self, + _request: AlterTableDropColumnsRequest, + ) -> Result { + Err(Error::NotSupported { + source: "alter_table_drop_columns not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// List all tags for a table. + async fn list_table_tags( + &self, + _request: ListTableTagsRequest, + ) -> Result { + Err(Error::NotSupported { + source: "list_table_tags not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Get the version for a specific tag. + async fn get_table_tag_version( + &self, + _request: GetTableTagVersionRequest, + ) -> Result { + Err(Error::NotSupported { + source: "get_table_tag_version not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Create a tag for a table. + async fn create_table_tag( + &self, + _request: CreateTableTagRequest, + ) -> Result { + Err(Error::NotSupported { + source: "create_table_tag not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Delete a tag from a table. + async fn delete_table_tag( + &self, + _request: DeleteTableTagRequest, + ) -> Result { + Err(Error::NotSupported { + source: "delete_table_tag not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Update a tag for a table. + async fn update_table_tag( + &self, + _request: UpdateTableTagRequest, + ) -> Result { + Err(Error::NotSupported { + source: "update_table_tag not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + /// Return a human-readable unique identifier for this namespace instance. /// /// This is used for equality comparison and hashing when the namespace is diff --git a/rust/lance-table/src/io/commit.rs b/rust/lance-table/src/io/commit.rs index 41cd5b65002..1d20fb72bd2 100644 --- a/rust/lance-table/src/io/commit.rs +++ b/rust/lance-table/src/io/commit.rs @@ -765,20 +765,22 @@ pub async fn commit_handler_from_url( } }; let options = options.clone().unwrap_or_default(); - let storage_options = StorageOptions(options.storage_options.unwrap_or_default()); - let dynamo_endpoint = get_dynamodb_endpoint(&storage_options); - let expires_at_millis = storage_options.expires_at_millis(); - let storage_options = storage_options.as_s3_options(); + let storage_options_raw = + StorageOptions(options.storage_options().cloned().unwrap_or_default()); + let dynamo_endpoint = get_dynamodb_endpoint(&storage_options_raw); + let storage_options = storage_options_raw.as_s3_options(); let region = storage_options.get(&AmazonS3ConfigKey::Region).cloned(); + // Get accessor from the options + let accessor = options.get_accessor(); + let (aws_creds, region) = build_aws_credential( options.s3_credentials_refresh_offset, options.aws_credentials.clone(), Some(&storage_options), region, - options.storage_options_provider.clone(), - expires_at_millis, + accessor, ) .await?; diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 722ba7c97e1..7565b96b434 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -35,7 +35,8 @@ use lance_file::reader::FileReaderOptions; use lance_file::version::LanceFileVersion; use lance_index::DatasetIndexExt; use lance_io::object_store::{ - LanceNamespaceStorageOptionsProvider, ObjectStore, ObjectStoreParams, + LanceNamespaceStorageOptionsProvider, ObjectStore, ObjectStoreParams, StorageOptions, + StorageOptionsAccessor, StorageOptionsProvider, }; use 
lance_io::utils::{read_last_block, read_message, read_metadata_offset, read_struct}; use lance_namespace::LanceNamespace; @@ -110,7 +111,9 @@ pub use blob::BlobFile; use hash_joiner::HashJoiner; use lance_core::box_error; pub use lance_core::ROW_ID; -use lance_namespace::models::{CreateEmptyTableRequest, DescribeTableRequest}; +use lance_namespace::models::{ + CreateEmptyTableRequest, DeclareTableRequest, DeclareTableResponse, DescribeTableRequest, +}; use lance_table::feature_flags::{apply_feature_flags, can_read_dataset}; pub use schema_evolution::{ BatchInfo, BatchUDF, ColumnAlteration, NewColumnTransform, UDFCheckpointStore, @@ -811,63 +814,83 @@ impl Dataset { /// * `namespace` - The namespace to use for table management /// * `table_id` - The table identifier /// * `params` - Write parameters - /// * `ignore_namespace_table_storage_options` - If true, ignore storage options returned - /// by the namespace and only use the storage options in params. The storage options - /// provider will not be created, so credentials will not be automatically refreshed. pub async fn write_into_namespace( batches: impl RecordBatchReader + Send + 'static, namespace: Arc, table_id: Vec, mut params: Option, - ignore_namespace_table_storage_options: bool, ) -> Result { let mut write_params = params.take().unwrap_or_default(); match write_params.mode { WriteMode::Create => { - let request = CreateEmptyTableRequest { + let declare_request = DeclareTableRequest { id: Some(table_id.clone()), - location: None, - properties: None, + ..Default::default() }; - let response = - namespace - .create_empty_table(request) - .await - .map_err(|e| Error::Namespace { + // Try declare_table first, fall back to deprecated create_empty_table + // for backward compatibility with older namespace implementations. + // create_empty_table support will be removed in 3.0.0. + #[allow(deprecated)] + let response = match namespace.declare_table(declare_request).await { + Ok(resp) => resp, + Err(Error::NotSupported { .. 
}) => { + let fallback_request = CreateEmptyTableRequest { + id: Some(table_id.clone()), + ..Default::default() + }; + let fallback_resp = namespace + .create_empty_table(fallback_request) + .await + .map_err(|e| Error::Namespace { + source: Box::new(e), + location: location!(), + })?; + DeclareTableResponse { + transaction_id: fallback_resp.transaction_id, + location: fallback_resp.location, + storage_options: fallback_resp.storage_options, + } + } + Err(e) => { + return Err(Error::Namespace { source: Box::new(e), location: location!(), - })?; + }); + } + }; let uri = response.location.ok_or_else(|| Error::Namespace { source: Box::new(std::io::Error::other( - "Table location not found in create_empty_table response", + "Table location not found in declare_table response", )), location: location!(), })?; - // Set initial credentials and provider unless ignored - if !ignore_namespace_table_storage_options { - if let Some(namespace_storage_options) = response.storage_options { - let provider = Arc::new(LanceNamespaceStorageOptionsProvider::new( - namespace, table_id, - )); + // Set initial credentials and provider from namespace + if let Some(namespace_storage_options) = response.storage_options { + let provider: Arc = Arc::new( + LanceNamespaceStorageOptionsProvider::new(namespace, table_id), + ); - // Merge namespace storage options with any existing options - let mut merged_options = write_params - .store_params - .as_ref() - .and_then(|p| p.storage_options.clone()) - .unwrap_or_default(); - merged_options.extend(namespace_storage_options); - - let existing_params = write_params.store_params.take().unwrap_or_default(); - write_params.store_params = Some(ObjectStoreParams { - storage_options: Some(merged_options), - storage_options_provider: Some(provider), - ..existing_params - }); - } + // Merge namespace storage options with any existing options + let mut merged_options = write_params + .store_params + .as_ref() + .and_then(|p| p.storage_options().cloned()) + .unwrap_or_default(); + merged_options.extend(namespace_storage_options); + + let accessor = Arc::new(StorageOptionsAccessor::with_initial_and_provider( + merged_options, + provider, + )); + + let existing_params = write_params.store_params.take().unwrap_or_default(); + write_params.store_params = Some(ObjectStoreParams { + storage_options_accessor: Some(accessor), + ..existing_params + }); } Self::write(batches, uri.as_str(), Some(write_params)).await @@ -875,7 +898,7 @@ impl Dataset { WriteMode::Append | WriteMode::Overwrite => { let request = DescribeTableRequest { id: Some(table_id.clone()), - version: None, + ..Default::default() }; let response = namespace @@ -893,29 +916,32 @@ impl Dataset { location: location!(), })?; - // Set initial credentials and provider unless ignored - if !ignore_namespace_table_storage_options { - if let Some(namespace_storage_options) = response.storage_options { - let provider = Arc::new(LanceNamespaceStorageOptionsProvider::new( + // Set initial credentials and provider from namespace + if let Some(namespace_storage_options) = response.storage_options { + let provider: Arc = + Arc::new(LanceNamespaceStorageOptionsProvider::new( namespace.clone(), table_id.clone(), )); - // Merge namespace storage options with any existing options - let mut merged_options = write_params - .store_params - .as_ref() - .and_then(|p| p.storage_options.clone()) - .unwrap_or_default(); - merged_options.extend(namespace_storage_options); - - let existing_params = write_params.store_params.take().unwrap_or_default(); - 
@@ -923,11 +949,8 @@
                 // assumes no dataset exists and converts the mode to CREATE.
                 let mut builder = DatasetBuilder::from_uri(uri.as_str());
                 if let Some(ref store_params) = write_params.store_params {
-                    if let Some(ref storage_options) = store_params.storage_options {
-                        builder = builder.with_storage_options(storage_options.clone());
-                    }
-                    if let Some(ref provider) = store_params.storage_options_provider {
-                        builder = builder.with_storage_options_provider(provider.clone());
+                    if let Some(accessor) = &store_params.storage_options_accessor {
+                        builder = builder.with_storage_options_accessor(accessor.clone());
                     }
                 }
                 let dataset = Arc::new(builder.load().await?);
@@ -1588,11 +1611,22 @@ impl Dataset {
         &self.object_store
     }

-    /// Returns the storage options used when opening this dataset, if any.
+    /// Returns the initial storage options used when opening this dataset, if any.
+    ///
+    /// This returns the static initial options without triggering any refresh.
+    /// For the latest refreshed options, use [`Self::latest_storage_options`].
+    #[deprecated(since = "0.25.0", note = "Use initial_storage_options() instead")]
     pub fn storage_options(&self) -> Option<&HashMap<String, String>> {
+        self.initial_storage_options()
+    }
+
+    /// Returns the initial storage options without triggering any refresh.
+    ///
+    /// For the latest refreshed options, use [`Self::latest_storage_options`].
+    pub fn initial_storage_options(&self) -> Option<&HashMap<String, String>> {
         self.store_params
             .as_ref()
-            .and_then(|params| params.storage_options.as_ref())
+            .and_then(|params| params.storage_options())
     }

     /// Returns the storage options provider used when opening this dataset, if any.
@@ -1601,7 +1635,42 @@
     ) -> Option<Arc<dyn StorageOptionsProvider>> {
         self.store_params
             .as_ref()
-            .and_then(|params| params.storage_options_provider.clone())
+            .and_then(|params| params.storage_options_accessor.as_ref())
+            .and_then(|accessor| accessor.provider().cloned())
+    }
+
+    /// Returns the unified storage options accessor for this dataset, if any.
+    ///
+    /// The accessor handles both static and dynamic storage options with automatic
+    /// caching and refresh. Use [`StorageOptionsAccessor::get_storage_options`] to
+    /// get the latest options.
+    pub fn storage_options_accessor(&self) -> Option<Arc<StorageOptionsAccessor>> {
+        self.store_params
+            .as_ref()
+            .and_then(|params| params.get_accessor())
+    }
+
+    /// Returns the latest (possibly refreshed) storage options.
+    ///
+    /// If a dynamic storage options provider is configured, this will return
+    /// the cached options if still valid, or fetch fresh options if expired.
+    ///
+    /// For the initial static options without refresh, use [`Self::initial_storage_options`].
+    ///
+    /// # Returns
+    ///
+    /// - `Ok(Some(options))` - Storage options are available (static or refreshed)
+    /// - `Ok(None)` - No storage options were configured for this dataset
+    /// - `Err(...)` - Error occurred while fetching/refreshing options from provider
+    pub async fn latest_storage_options(&self) -> Result<Option<StorageOptions>> {
+        // First check if we have an accessor (handles both static and dynamic options)
+        if let Some(accessor) = self.storage_options_accessor() {
+            let options = accessor.get_storage_options().await?;
+            return Ok(Some(options));
+        }
+
+        // Fallback to initial storage options if no accessor
+        Ok(self.initial_storage_options().cloned().map(StorageOptions))
     }

     pub fn data_dir(&self) -> Path {
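Note: for callers, the practical consequence of the accessor plumbing above is to ask the dataset for refreshed options instead of caching the initial map. A usage sketch, assuming lance re-exports Result at the crate root and that StorageOptions keeps a public tuple field over the options map (the diff constructs it with `.map(StorageOptions)`):

use lance::Dataset;

async fn current_option_count(dataset: &Dataset) -> lance::Result<usize> {
    // Refreshes through the provider when cached credentials have expired;
    // a static-only dataset simply returns its initial options.
    match dataset.latest_storage_options().await? {
        Some(options) => Ok(options.0.len()),
        None => Ok(0), // no storage options were configured
    }
}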
diff --git a/rust/lance/src/dataset/builder.rs b/rust/lance/src/dataset/builder.rs
index 16326630d23..8ee5ffa5e41 100644
--- a/rust/lance/src/dataset/builder.rs
+++ b/rust/lance/src/dataset/builder.rs
@@ -12,7 +12,7 @@ use lance_file::datatypes::populate_schema_dictionary;
 use lance_file::reader::FileReaderOptions;
 use lance_io::object_store::{
     LanceNamespaceStorageOptionsProvider, ObjectStore, ObjectStoreParams, StorageOptions,
-    DEFAULT_CLOUD_IO_PARALLELISM,
+    StorageOptionsAccessor, DEFAULT_CLOUD_IO_PARALLELISM,
 };
 use lance_namespace::models::DescribeTableRequest;
 use lance_namespace::LanceNamespace;
@@ -95,8 +95,6 @@ impl DatasetBuilder {
     /// # Arguments
     /// * `namespace` - The namespace implementation to fetch table info from
     /// * `table_id` - The table identifier (e.g., vec!["my_table"])
-    /// * `ignore_namespace_table_storage_options` - If true, storage options returned from
-    ///   the namespace's `describe_table()` will be ignored (treated as None). Defaults to false.
     ///
     /// # Example
     /// ```ignore
@@ -111,32 +109,21 @@ impl DatasetBuilder {
     ///
     /// // Load a dataset using storage options from namespace
     /// let dataset = DatasetBuilder::from_namespace(
-    ///     namespace.clone(),
-    ///     vec!["my_table".to_string()],
-    ///     false,
-    /// )
-    /// .await?
-    /// .load()
-    /// .await?;
-    ///
-    /// // Load a dataset ignoring namespace storage options
-    /// let dataset = DatasetBuilder::from_namespace(
     ///     namespace,
     ///     vec!["my_table".to_string()],
-    ///     true,
     /// )
     /// .await?
     /// .load()
     /// .await?;
     /// ```
+    #[allow(deprecated)]
     pub async fn from_namespace(
         namespace: Arc<dyn LanceNamespace>,
         table_id: Vec<String>,
-        ignore_namespace_table_storage_options: bool,
     ) -> Result<Self> {
         let request = DescribeTableRequest {
             id: Some(table_id.clone()),
-            version: None,
+            ..Default::default()
         };

         let response = namespace
@@ -156,17 +143,17 @@ impl DatasetBuilder {

         let mut builder = Self::from_uri(table_uri);

-        let namespace_storage_options = if ignore_namespace_table_storage_options {
-            None
-        } else {
-            response.storage_options
-        };
+        // Use namespace storage options if available
+        let namespace_storage_options = response.storage_options;

         builder.storage_options_override = namespace_storage_options.clone();

-        if namespace_storage_options.is_some() {
-            builder.options.storage_options_provider = Some(Arc::new(
+        if let Some(initial_opts) = namespace_storage_options {
+            let provider: Arc<dyn StorageOptionsProvider> = Arc::new(
                 LanceNamespaceStorageOptionsProvider::new(namespace, table_id),
+            );
+            builder.options.storage_options_accessor = Some(Arc::new(
+                StorageOptionsAccessor::with_initial_and_provider(initial_opts, provider),
             ));
         }
@@ -289,7 +276,27 @@ impl DatasetBuilder {
     /// - [S3 options](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html#variants)
     /// - [Google options](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants)
     pub fn with_storage_options(mut self, storage_options: HashMap<String, String>) -> Self {
-        self.options.storage_options = Some(storage_options);
+        // Merge with existing options if accessor exists, otherwise create new static accessor
+        if let Some(existing) = self.options.storage_options_accessor.take() {
+            let mut merged = existing
+                .initial_storage_options()
+                .cloned()
+                .unwrap_or_default();
+            merged.extend(storage_options);
+            if let Some(provider) = existing.provider().cloned() {
+                self.options.storage_options_accessor = Some(Arc::new(
+                    StorageOptionsAccessor::with_initial_and_provider(merged, provider),
+                ));
+            } else {
+                self.options.storage_options_accessor = Some(Arc::new(
+                    StorageOptionsAccessor::with_static_options(merged),
+                ));
+            }
+        } else {
+            self.options.storage_options_accessor = Some(Arc::new(
+                StorageOptionsAccessor::with_static_options(storage_options),
+            ));
+        }
         self
     }

@@ -301,9 +308,25 @@ impl DatasetBuilder {
     ///     .with_storage_option("region", "us-east-1");
     /// ```
     pub fn with_storage_option(mut self, key: impl AsRef<str>, value: impl AsRef<str>) -> Self {
-        let mut storage_options = self.options.storage_options.unwrap_or_default();
+        let mut storage_options = self.options.storage_options().cloned().unwrap_or_default();
         storage_options.insert(key.as_ref().to_string(), value.as_ref().to_string());
-        self.options.storage_options = Some(storage_options);
+
+        // Merge with existing accessor if present
+        if let Some(existing) = self.options.storage_options_accessor.take() {
+            if let Some(provider) = existing.provider().cloned() {
+                self.options.storage_options_accessor = Some(Arc::new(
+                    StorageOptionsAccessor::with_initial_and_provider(storage_options, provider),
+                ));
+            } else {
+                self.options.storage_options_accessor = Some(Arc::new(
+                    StorageOptionsAccessor::with_static_options(storage_options),
+                ));
+            }
+        } else {
+            self.options.storage_options_accessor = Some(Arc::new(
+                StorageOptionsAccessor::with_static_options(storage_options),
+            ));
+        }
         self
     }
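Note: because both setters above now merge into any accessor that is already present rather than overwriting it, option calls can be chained in any order without clobbering a provider configured earlier. A sketch of the intended call pattern, using only APIs shown in this diff; the bucket, keys, and values are placeholders:

use std::collections::HashMap;

use lance::dataset::builder::DatasetBuilder;

fn configured_builder() -> DatasetBuilder {
    let initial: HashMap<String, String> =
        [("allow_http".to_string(), "true".to_string())].into();
    // The second call merges "region" into the accessor created by the
    // first; a provider attached earlier would be re-wrapped, not dropped.
    DatasetBuilder::from_uri("s3://example-bucket/table.lance")
        .with_storage_options(initial)
        .with_storage_option("region", "us-east-1")
}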
@@ -355,7 +378,50 @@ impl DatasetBuilder {
         mut self,
         provider: Arc<dyn StorageOptionsProvider>,
     ) -> Self {
-        self.options.storage_options_provider = Some(provider);
+        // Preserve existing storage options if any
+        if let Some(existing) = self.options.storage_options_accessor.take() {
+            if let Some(initial) = existing.initial_storage_options().cloned() {
+                self.options.storage_options_accessor = Some(Arc::new(
+                    StorageOptionsAccessor::with_initial_and_provider(initial, provider),
+                ));
+            } else {
+                self.options.storage_options_accessor =
+                    Some(Arc::new(StorageOptionsAccessor::with_provider(provider)));
+            }
+        } else {
+            self.options.storage_options_accessor =
+                Some(Arc::new(StorageOptionsAccessor::with_provider(provider)));
+        }
+        self
+    }
+
+    /// Set a unified storage options accessor for credential management
+    ///
+    /// The accessor bundles static storage options with an optional dynamic provider,
+    /// handling all caching and refresh logic internally.
+    ///
+    /// # Arguments
+    /// * `accessor` - The storage options accessor
+    ///
+    /// # Example
+    /// ```ignore
+    /// use std::sync::Arc;
+    /// use lance_io::object_store::StorageOptionsAccessor;
+    ///
+    /// // Create an accessor with a dynamic provider
+    /// let accessor = Arc::new(StorageOptionsAccessor::with_provider(provider));
+    ///
+    /// let dataset = DatasetBuilder::from_uri("s3://bucket/table.lance")
+    ///     .with_storage_options_accessor(accessor)
+    ///     .load()
+    ///     .await?;
+    /// ```
+    pub fn with_storage_options_accessor(mut self, accessor: Arc<StorageOptionsAccessor>) -> Self {
+        self.options.storage_options_accessor = Some(accessor);
         self
     }
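Note: the builder now accepts the accessor in the three shapes matching the three constructors used throughout this diff. A compact sketch of choosing between them; the helper is hypothetical, and the provider value stands in for any Arc<dyn StorageOptionsProvider>:

use std::collections::HashMap;
use std::sync::Arc;

use lance_io::object_store::{StorageOptionsAccessor, StorageOptionsProvider};

fn make_accessor(
    options: HashMap<String, String>,
    provider: Option<Arc<dyn StorageOptionsProvider>>,
) -> Arc<StorageOptionsAccessor> {
    Arc::new(match provider {
        // Static credentials only: nothing will ever be refreshed.
        None => StorageOptionsAccessor::with_static_options(options),
        // Initial options plus a provider that can refresh them later.
        Some(p) if !options.is_empty() => {
            StorageOptionsAccessor::with_initial_and_provider(options, p)
        }
        // Provider only: the first use fetches the options.
        Some(p) => StorageOptionsAccessor::with_provider(p),
    })
}

The result can be handed to DatasetBuilder::with_storage_options_accessor or placed directly in ObjectStoreParams, as the test changes at the end of this diff do.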
@@ -418,8 +484,8 @@ impl DatasetBuilder {

         let storage_options = self
             .options
-            .storage_options
-            .clone()
+            .storage_options()
+            .cloned()
             .map(StorageOptions::new)
             .unwrap_or_default();
         let download_retry_count = storage_options.download_retry_count();
@@ -478,12 +544,29 @@ impl DatasetBuilder {
     }

     async fn load_impl(mut self) -> Result<Dataset> {
-        // Apply storage_options_override last to ensure namespace options take precedence
+        // Apply storage_options_override to merge namespace options with any existing accessor
         if let Some(override_opts) = self.storage_options_override.take() {
-            let mut merged_opts = self.options.storage_options.clone().unwrap_or_default();
+            // Get existing options and merge
+            let mut merged_opts = self.options.storage_options().cloned().unwrap_or_default();
             // Override with namespace storage options - they take precedence
             merged_opts.extend(override_opts);
-            self.options.storage_options = Some(merged_opts);
+
+            // Update accessor with merged options
+            if let Some(accessor) = &self.options.storage_options_accessor {
+                if let Some(provider) = accessor.provider().cloned() {
+                    self.options.storage_options_accessor = Some(Arc::new(
+                        StorageOptionsAccessor::with_initial_and_provider(merged_opts, provider),
+                    ));
+                } else {
+                    self.options.storage_options_accessor = Some(Arc::new(
+                        StorageOptionsAccessor::with_static_options(merged_opts),
+                    ));
+                }
+            } else {
+                self.options.storage_options_accessor = Some(Arc::new(
+                    StorageOptionsAccessor::with_static_options(merged_opts),
+                ));
+            }
         }

         let session = match self.session.as_ref() {
diff --git a/rust/lance/src/dataset/fragment/write.rs b/rust/lance/src/dataset/fragment/write.rs
index b4e96ccbe27..cf7361b5878 100644
--- a/rust/lance/src/dataset/fragment/write.rs
+++ b/rust/lance/src/dataset/fragment/write.rs
@@ -287,12 +287,12 @@ impl<'a> FragmentCreateBuilder<'a> {
     async fn existing_dataset_schema(&self) -> Result<Option<Schema>> {
         let mut builder = DatasetBuilder::from_uri(self.dataset_uri);
-        let storage_options = self
+        let accessor = self
             .write_params
             .and_then(|p| p.store_params.as_ref())
-            .and_then(|p| p.storage_options.clone());
-        if let Some(storage_options) = storage_options {
-            builder = builder.with_storage_options(storage_options);
+            .and_then(|p| p.storage_options_accessor.clone());
+        if let Some(accessor) = accessor {
+            builder = builder.with_storage_options_accessor(accessor);
         }
         match builder.load().await {
             Ok(dataset) => {
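Note: FragmentCreateBuilder above forwards whatever accessor it finds in store_params, so fragment writers re-open the dataset with the same (refreshable) credentials as the top-level write. A sketch of wiring that up from the caller's side, assuming WriteParams implements Default and using the re-export added in the io.rs hunk below:

use std::sync::Arc;

use lance::dataset::WriteParams;
use lance::io::{ObjectStoreParams, StorageOptionsAccessor};

fn write_params_with_accessor(accessor: Arc<StorageOptionsAccessor>) -> WriteParams {
    WriteParams {
        store_params: Some(ObjectStoreParams {
            // Everything downstream (including FragmentCreateBuilder's
            // schema lookup) clones this accessor instead of raw options.
            storage_options_accessor: Some(accessor),
            ..Default::default()
        }),
        ..Default::default()
    }
}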
diff --git a/rust/lance/src/io.rs b/rust/lance/src/io.rs
index 1ad45ce2d68..1113ef0a2a7 100644
--- a/rust/lance/src/io.rs
+++ b/rust/lance/src/io.rs
@@ -9,6 +9,9 @@ pub mod exec;

 pub use lance_io::{
     bytes_read_counter, iops_counter,
-    object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry, WrappingObjectStore},
+    object_store::{
+        ObjectStore, ObjectStoreParams, ObjectStoreRegistry, StorageOptionsAccessor,
+        WrappingObjectStore,
+    },
     stream::RecordBatchStream,
 };
diff --git a/rust/lance/src/io/commit/s3_test.rs b/rust/lance/src/io/commit/s3_test.rs
index 35e64703688..1402fb25d46 100644
--- a/rust/lance/src/io/commit/s3_test.rs
+++ b/rust/lance/src/io/commit/s3_test.rs
@@ -8,7 +8,7 @@ use crate::{
     dataset::{
         builder::DatasetBuilder, CommitBuilder, InsertBuilder, ReadParams, WriteMode, WriteParams,
     },
-    io::ObjectStoreParams,
+    io::{ObjectStoreParams, StorageOptionsAccessor},
 };
 use aws_config::{BehaviorVersion, ConfigLoader, Region, SdkConfig};
 use aws_sdk_s3::{config::Credentials, Client as S3Client};
@@ -186,12 +186,12 @@ async fn test_concurrent_writers() {
     // Create a table
     let store_params = ObjectStoreParams {
         object_store_wrapper: Some(io_tracker.clone()),
-        storage_options: Some(
+        storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options(
             CONFIG
                 .iter()
                 .map(|(k, v)| (k.to_string(), v.to_string()))
                 .collect(),
-        ),
+        ))),
         ..Default::default()
     };
     let write_params = WriteParams {
@@ -270,12 +270,12 @@ async fn test_ddb_open_iops() {
     // Create a table
     let store_params = ObjectStoreParams {
         object_store_wrapper: Some(io_tracker.clone()),
-        storage_options: Some(
+        storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options(
             CONFIG
                 .iter()
                 .map(|(k, v)| (k.to_string(), v.to_string()))
                 .collect(),
-        ),
+        ))),
         ..Default::default()
     };
     let write_params = WriteParams {