From 8dc1662fa69244dd85b390f3ec3b698357e0c856 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Fri, 19 Dec 2025 00:07:12 -0800 Subject: [PATCH 1/2] refactor --- java/lance-jni/Cargo.lock | 1 + java/lance-jni/src/blocking_dataset.rs | 100 +- java/lance-jni/src/file_reader.rs | 6 +- java/lance-jni/src/file_writer.rs | 6 +- java/lance-jni/src/transaction.rs | 17 +- java/lance-jni/src/utils.rs | 23 +- java/src/main/java/org/lance/Dataset.java | 18 + .../java/org/lance/OpenDatasetBuilder.java | 48 +- .../java/org/lance/WriteDatasetBuilder.java | 67 +- python/python/lance/__init__.py | 49 +- python/python/lance/dataset.py | 148 ++- python/src/dataset.rs | 160 ++- python/src/file.rs | 32 +- rust/lance-io/src/object_store.rs | 153 +-- rust/lance-io/src/object_store/providers.rs | 2 +- .../src/object_store/providers/aws.rs | 128 +- .../src/object_store/providers/azure.rs | 121 +- .../src/object_store/providers/gcp.rs | 96 +- .../src/object_store/providers/huggingface.rs | 38 +- .../src/object_store/providers/local.rs | 6 +- .../src/object_store/providers/memory.rs | 8 +- .../src/object_store/providers/oss.rs | 25 +- .../src/object_store/storage_options.rs | 1174 +++++++++++++++++ rust/lance-namespace-impls/src/dir.rs | 12 +- .../lance-namespace-impls/src/dir/manifest.rs | 6 +- rust/lance-table/src/io/commit.rs | 33 +- rust/lance/src/dataset.rs | 99 +- rust/lance/src/dataset/builder.rs | 168 ++- rust/lance/src/dataset/fragment/write.rs | 17 +- rust/lance/src/dataset/write/commit.rs | 9 +- rust/lance/src/dataset/write/insert.rs | 18 +- 31 files changed, 2120 insertions(+), 668 deletions(-) diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 86319138591..156a118e70d 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -3518,6 +3518,7 @@ dependencies = [ "half", "hex", "rand 0.9.2", + "rand_distr 0.5.1", "rand_xoshiro", "random_word", ] diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index 55b108dd9cc..1357b6b36ba 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -44,6 +44,7 @@ use lance_index::scalar::lance_format::LanceIndexStore; use lance_index::DatasetIndexExt; use lance_index::{IndexParams, IndexType}; use lance_io::object_store::ObjectStoreRegistry; +use lance_io::object_store::StorageOptionsAccessor; use lance_io::object_store::StorageOptionsProvider; use std::collections::HashMap; use std::future::IntoFuture; @@ -75,16 +76,18 @@ pub struct BlockingDataset { } impl BlockingDataset { - /// Get the storage options provider that was used when opening this dataset - pub fn get_storage_options_provider(&self) -> Option> { - self.inner.storage_options_provider() + /// Get the storage options accessor that was used when opening this dataset + pub fn get_storage_options_accessor(&self) -> Option> { + self.inner.storage_options_accessor() } pub fn drop(uri: &str, storage_options: HashMap) -> Result<()> { RT.block_on(async move { let registry = Arc::new(ObjectStoreRegistry::default()); let object_store_params = ObjectStoreParams { - storage_options: Some(storage_options.clone()), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::new_with_options( + storage_options.clone(), + ))), ..Default::default() }; let (object_store, path) = @@ -118,18 +121,31 @@ impl BlockingDataset { storage_options_provider: Option>, s3_credentials_refresh_offset_seconds: Option, ) -> Result { - let mut store_params = ObjectStoreParams { + let s3_credentials_refresh_offset = 
s3_credentials_refresh_offset_seconds + .map(std::time::Duration::from_secs) + .unwrap_or(std::time::Duration::from_secs(60)); + + // Create StorageOptionsAccessor with options and optional provider + let storage_options_accessor = match storage_options_provider { + Some(provider) => Some(Arc::new( + StorageOptionsAccessor::new_with_options_and_provider( + storage_options.clone(), + provider, + s3_credentials_refresh_offset, + ), + )), + None => Some(Arc::new(StorageOptionsAccessor::new_with_options( + storage_options.clone(), + ))), + }; + + let store_params = ObjectStoreParams { block_size: block_size.map(|size| size as usize), - storage_options: Some(storage_options.clone()), + storage_options_accessor: storage_options_accessor.clone(), + s3_credentials_refresh_offset, ..Default::default() }; - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - store_params.s3_credentials_refresh_offset = - std::time::Duration::from_secs(offset_seconds); - } - if let Some(provider) = storage_options_provider.clone() { - store_params.storage_options_provider = Some(provider); - } + let params = ReadParams { index_cache_size_bytes: index_cache_size_bytes as usize, metadata_cache_size_bytes: metadata_cache_size_bytes as usize, @@ -143,8 +159,8 @@ impl BlockingDataset { builder = builder.with_version(ver as u64); } builder = builder.with_storage_options(storage_options); - if let Some(provider) = storage_options_provider.clone() { - builder = builder.with_storage_options_provider(provider) + if let Some(accessor) = storage_options_accessor { + builder = builder.with_storage_options_accessor(accessor) } if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { builder = builder @@ -170,7 +186,9 @@ impl BlockingDataset { operation, read_version, Some(ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::new_with_options( + storage_options, + ))), ..Default::default() }), None, @@ -1394,7 +1412,9 @@ fn inner_shallow_clone<'local>( storage_options .map(|options| { Some(ObjectStoreParams { - storage_options: Some(options), + storage_options_accessor: Some(Arc::new( + StorageOptionsAccessor::new_with_options(options), + )), ..Default::default() }) }) @@ -1546,6 +1566,52 @@ fn inner_get_config<'local>( Ok(java_hashmap) } +#[no_mangle] +pub extern "system" fn Java_org_lance_Dataset_nativeGetCurrentStorageOptions<'local>( + mut env: JNIEnv<'local>, + java_dataset: JObject, +) -> JObject<'local> { + ok_or_throw!( + env, + inner_get_current_storage_options(&mut env, java_dataset) + ) +} + +fn inner_get_current_storage_options<'local>( + env: &mut JNIEnv<'local>, + java_dataset: JObject, +) -> Result> { + let options = { + let dataset_guard = + unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; + RT.block_on(dataset_guard.inner.current_storage_options()) + .map_err(|e| Error::io_error(e.to_string()))? 
+ }; + + let java_hashmap = env + .new_object("java/util/HashMap", "()V", &[]) + .expect("Failed to create Java HashMap"); + + for (k, v) in options { + let java_key = env + .new_string(&k) + .expect("Failed to create Java String (key)"); + let java_value = env + .new_string(&v) + .expect("Failed to create Java String (value)"); + + env.call_method( + &java_hashmap, + "put", + "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;", + &[JValue::Object(&java_key), JValue::Object(&java_value)], + ) + .expect("Failed to call HashMap.put()"); + } + + Ok(java_hashmap) +} + #[no_mangle] pub extern "system" fn Java_org_lance_Dataset_nativeTake( mut env: JNIEnv, diff --git a/java/lance-jni/src/file_reader.rs b/java/lance-jni/src/file_reader.rs index 11591b3acea..eca7c90f97b 100644 --- a/java/lance-jni/src/file_reader.rs +++ b/java/lance-jni/src/file_reader.rs @@ -23,7 +23,7 @@ use lance_core::cache::LanceCache; use lance_core::datatypes::Schema; use lance_encoding::decoder::{DecoderPlugins, FilterExpression}; use lance_file::reader::{FileReader, FileReaderOptions, ReaderProjection}; -use lance_io::object_store::{ObjectStoreParams, ObjectStoreRegistry}; +use lance_io::object_store::{ObjectStoreParams, ObjectStoreRegistry, StorageOptionsAccessor}; use lance_io::{ scheduler::{ScanScheduler, SchedulerConfig}, utils::CachedFileSize, @@ -112,7 +112,9 @@ fn inner_open<'local>( let storage_options = to_rust_map(env, &jmap)?; let reader = RT.block_on(async move { let object_params = ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::new_with_options( + storage_options, + ))), ..Default::default() }; let (obj_store, path) = ObjectStore::from_uri_and_params( diff --git a/java/lance-jni/src/file_writer.rs b/java/lance-jni/src/file_writer.rs index 600d7de2845..db85ec96a1a 100644 --- a/java/lance-jni/src/file_writer.rs +++ b/java/lance-jni/src/file_writer.rs @@ -25,7 +25,7 @@ use lance_file::{ version::LanceFileVersion, writer::{FileWriter, FileWriterOptions}, }; -use lance_io::object_store::{ObjectStoreParams, ObjectStoreRegistry}; +use lance_io::object_store::{ObjectStoreParams, ObjectStoreRegistry, StorageOptionsAccessor}; pub const NATIVE_WRITER: &str = "nativeFileWriterHandle"; @@ -94,7 +94,9 @@ fn inner_open<'local>( let writer = RT.block_on(async move { let object_params = ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::new_with_options( + storage_options, + ))), ..Default::default() }; let (obj_store, path) = ObjectStore::from_uri_and_params( diff --git a/java/lance-jni/src/transaction.rs b/java/lance-jni/src/transaction.rs index 9b077e7498c..82d6d994aa7 100644 --- a/java/lance-jni/src/transaction.rs +++ b/java/lance-jni/src/transaction.rs @@ -18,6 +18,7 @@ use lance::dataset::transaction::{ UpdateMap, UpdateMapEntry, UpdateMode, }; use lance::io::ObjectStoreParams; +use lance_io::object_store::StorageOptionsAccessor; use lance::table::format::{Fragment, IndexMetadata}; use lance_core::datatypes::Schema as LanceSchema; use prost::Message; @@ -752,17 +753,21 @@ fn inner_commit_transaction<'local>( .map(std::time::Duration::from_secs) .unwrap_or_else(|| std::time::Duration::from_secs(10)); - // Get the Dataset's storage_options_provider - let storage_options_provider = { + // Get the Dataset's storage_options_accessor + let existing_accessor = { let dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(&java_dataset, NATIVE_DATASET) 
}?; - dataset_guard.get_storage_options_provider() + dataset_guard.get_storage_options_accessor() }; - // Build ObjectStoreParams using write_param for storage_options and provider from Dataset + // Build ObjectStoreParams using write_param for storage_options and accessor from Dataset + let storage_options_accessor = Some(Arc::new(existing_accessor.map_or_else( + || StorageOptionsAccessor::new_with_options(write_param.clone()), + |a| a.with_updated_initial_options(write_param.clone(), s3_credentials_refresh_offset), + ))); + let store_params = ObjectStoreParams { - storage_options: Some(write_param), - storage_options_provider, + storage_options_accessor, s3_credentials_refresh_offset, ..Default::default() }; diff --git a/java/lance-jni/src/utils.rs b/java/lance-jni/src/utils.rs index 5cb55c200e1..84a43086cfa 100644 --- a/java/lance-jni/src/utils.rs +++ b/java/lance-jni/src/utils.rs @@ -25,7 +25,7 @@ use crate::storage_options::JavaStorageOptionsProvider; use crate::traits::FromJObjectWithEnv; use lance_index::vector::Query; -use lance_io::object_store::StorageOptionsProvider; +use lance_io::object_store::{StorageOptionsAccessor, StorageOptionsProvider}; use std::collections::HashMap; use std::str::FromStr; @@ -84,15 +84,27 @@ pub fn extract_write_params( JavaStorageOptionsProvider::new(env, provider_obj) })?; - let storage_options_provider_arc: Option> = - storage_options_provider.map(|v| Arc::new(v) as Arc); - // Extract s3_credentials_refresh_offset_seconds if present let s3_credentials_refresh_offset = env .get_long_opt(s3_credentials_refresh_offset_seconds_obj)? .map(|v| std::time::Duration::from_secs(v as u64)) .unwrap_or_else(|| std::time::Duration::from_secs(10)); + // Create StorageOptionsAccessor with options and optional provider + let storage_options_accessor = match storage_options_provider { + Some(provider) => { + let provider_arc = Arc::new(provider) as Arc; + Some(Arc::new(StorageOptionsAccessor::new_with_options_and_provider( + storage_options.clone(), + provider_arc, + s3_credentials_refresh_offset, + ))) + } + None => Some(Arc::new(StorageOptionsAccessor::new_with_options( + storage_options.clone(), + ))), + }; + if let Some(initial_bases) = env.get_list_opt(initial_bases, |env, elem| elem.extract_object(env))? { @@ -104,8 +116,7 @@ pub fn extract_write_params( } write_params.store_params = Some(ObjectStoreParams { - storage_options: Some(storage_options), - storage_options_provider: storage_options_provider_arc, + storage_options_accessor, s3_credentials_refresh_offset, ..Default::default() }); diff --git a/java/src/main/java/org/lance/Dataset.java b/java/src/main/java/org/lance/Dataset.java index 6b5983bfb08..a53a3a39291 100644 --- a/java/src/main/java/org/lance/Dataset.java +++ b/java/src/main/java/org/lance/Dataset.java @@ -995,6 +995,24 @@ public Map getConfig() { private native Map nativeGetConfig(); + /** + * Get the current storage options. + * + *

+   * <p>Returns a map combining the initial storage_options with any overrides from the
+   * storage_options_provider. Options from the provider are cached until they expire (based on
+   * expires_at_millis).
+   *
+   * @return the current storage options
+   */
+  public Map<String, String> getCurrentStorageOptions() {
+    try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) {
+      Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed");
+      return nativeGetCurrentStorageOptions();
+    }
+  }
+
+  private native Map<String, String> nativeGetCurrentStorageOptions();
+
   /**
    * Compact the dataset to improve performance.
    *
diff --git a/java/src/main/java/org/lance/OpenDatasetBuilder.java b/java/src/main/java/org/lance/OpenDatasetBuilder.java
index ae082e14ceb..9c07b88bc94 100644
--- a/java/src/main/java/org/lance/OpenDatasetBuilder.java
+++ b/java/src/main/java/org/lance/OpenDatasetBuilder.java
@@ -30,10 +30,22 @@
  * Builder for opening a Dataset.
  *
  * <p>This builder provides a fluent API for opening datasets either directly from a URI or from a
- * LanceNamespace. When using a namespace, the table location and storage options are automatically
- * fetched.
+ * LanceNamespace.
  *
- * <p>Example usage with URI:
+ * <h3>Namespace Behavior</h3>
+ *
+ * <p>When {@code namespace} and {@code tableId} are provided (instead of {@code uri}), the builder
+ * automatically:
+ *
+ * <ol>
+ *   <li>Calls {@code describeTable()} on the namespace to get table metadata
+ *   <li>Uses the returned location as the dataset URI
+ *   <li>Merges namespace storage options with any user-provided storage options (namespace options
+ *       take precedence)
+ *   <li>Creates a {@link LanceNamespaceStorageOptionsProvider} for automatic credential refresh
+ * </ol>
+ *
+ * <h3>Example: URI only</h3>
  *
  * <pre>{@code
  * Dataset dataset = Dataset.open()
@@ -42,7 +54,7 @@
  *     .build();
  * }</pre>
  *
- * <p>Example usage with namespace:
+ * <h3>Example: Namespace</h3>
  *
  * <pre>{@code
  * Dataset dataset = Dataset.open()
@@ -50,6 +62,22 @@
  *     .tableId(Arrays.asList("my_table"))
  *     .build();
  * }</pre>
+ *
+ * <h3>Example: URI with custom storage options provider</h3>
+ *
+ * <p>If you need to use a specific URI with namespace-based credential refresh, create the provider
+ * manually:
+ *
+ * <pre>{@code
+ * LanceNamespaceStorageOptionsProvider provider =
+ *     new LanceNamespaceStorageOptionsProvider(myNamespace, Arrays.asList("my_table"));
+ * Dataset dataset = Dataset.open()
+ *     .uri("s3://bucket/table.lance")
+ *     .readOptions(new ReadOptions.Builder()
+ *         .setStorageOptionsProvider(provider)
+ *         .build())
+ *     .build();
+ * }</pre>
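+ *
+ * <p>A sketch of reading back the merged options (reusing the {@code dataset} and {@code provider}
+ * from the example above): {@link Dataset#getCurrentStorageOptions()}, added alongside this builder
+ * change, returns the initial storage options combined with any overrides fetched from the
+ * provider, with provider values taking precedence on conflicting keys:
+ *
+ * <pre>{@code
+ * Map<String, String> current = dataset.getCurrentStorageOptions();
+ * }</pre>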
*/ public class OpenDatasetBuilder { private BufferAllocator allocator; @@ -213,9 +241,15 @@ private Dataset buildFromNamespace() { .setMetadataCacheSizeBytes(options.getMetadataCacheSizeBytes()); if (namespaceStorageOptions != null && !namespaceStorageOptions.isEmpty()) { - LanceNamespaceStorageOptionsProvider storageOptionsProvider = - new LanceNamespaceStorageOptionsProvider(namespace, tableId); - optionsBuilder.setStorageOptionsProvider(storageOptionsProvider); + // Only create namespace provider if user didn't provide one explicitly via ReadOptions + if (!options.getStorageOptionsProvider().isPresent()) { + LanceNamespaceStorageOptionsProvider storageOptionsProvider = + new LanceNamespaceStorageOptionsProvider(namespace, tableId); + optionsBuilder.setStorageOptionsProvider(storageOptionsProvider); + } else { + // User provided their own provider - use it + optionsBuilder.setStorageOptionsProvider(options.getStorageOptionsProvider().get()); + } } options.getVersion().ifPresent(optionsBuilder::setVersion); diff --git a/java/src/main/java/org/lance/WriteDatasetBuilder.java b/java/src/main/java/org/lance/WriteDatasetBuilder.java index 6f49ce9338a..9ea7a0bed07 100644 --- a/java/src/main/java/org/lance/WriteDatasetBuilder.java +++ b/java/src/main/java/org/lance/WriteDatasetBuilder.java @@ -41,7 +41,7 @@ * URI or through a LanceNamespace. When using a namespace, the table location and storage options * are automatically managed with credential vending support. * - *

- * <p>Example usage with URI and reader:
+ * <h3>Example: URI only</h3>
 *
 * <pre>{@code
 * Dataset dataset = Dataset.write(allocator)
@@ -51,7 +51,7 @@
 *     .execute();
 * }</pre>
 *
- * <p>Example usage with namespace:
+ * <h3>Example: Namespace</h3>
 *
 * <pre>{@code
 * Dataset dataset = Dataset.write(allocator)
@@ -61,6 +61,21 @@
 *     .mode(WriteMode.CREATE)
 *     .execute();
 * }</pre>
+ *
+ * <h3>Example: URI with custom storage options provider</h3>
+ *
+ * <p>If you need to use a specific URI with namespace-based credential refresh:
+ *
+ * <pre>{@code
+ * LanceNamespaceStorageOptionsProvider provider =
+ *     new LanceNamespaceStorageOptionsProvider(myNamespace, Arrays.asList("my_table"));
+ * Dataset dataset = Dataset.write(allocator)
+ *     .reader(myReader)
+ *     .uri("s3://bucket/table.lance")
+ *     .storageOptionsProvider(provider)
+ *     .mode(WriteMode.CREATE)
+ *     .execute();
+ * }</pre>
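+ *
+ * <p>A hypothetical sketch of the precedence rule ({@code myProvider} is a placeholder): when both
+ * an explicit provider and a namespace are configured, the explicitly set provider is used and no
+ * namespace-backed provider is created:
+ *
+ * <pre>{@code
+ * Dataset dataset = Dataset.write(allocator)
+ *     .reader(myReader)
+ *     .namespace(myNamespace)
+ *     .tableId(Arrays.asList("my_table"))
+ *     .storageOptionsProvider(myProvider) // takes precedence over the namespace provider
+ *     .mode(WriteMode.CREATE)
+ *     .execute();
+ * }</pre>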
 */
public class WriteDatasetBuilder {
  private BufferAllocator allocator;
@@ -81,6 +96,7 @@ public class WriteDatasetBuilder {
   private Optional<Long> s3CredentialsRefreshOffsetSeconds = Optional.empty();
   private Optional<List<String>> initialBases = Optional.empty();
   private Optional<List<String>> targetBases = Optional.empty();
+  private StorageOptionsProvider storageOptionsProvider = null;
 
   /** Creates a new builder instance. Package-private, use Dataset.write() instead. */
   WriteDatasetBuilder() {
@@ -299,6 +315,34 @@ public WriteDatasetBuilder targetBases(List<String> targetBases) {
     return this;
   }
 
+  /**
+   * Sets a custom storage options provider for dynamic credential fetching.
+   *
+   * <p>This allows using a specific URI with namespace-based credential refresh. If both this
+   * method and namespace()+tableId() are called, the explicitly provided storageOptionsProvider
+   * takes precedence.
+   *
+   * <p>Example:
+   *
+   * <pre>{@code
+   * LanceNamespaceStorageOptionsProvider provider =
+   *     new LanceNamespaceStorageOptionsProvider(myNamespace, Arrays.asList("my_table"));
+   * Dataset dataset = Dataset.write(allocator)
+   *     .reader(myReader)
+   *     .uri("s3://bucket/table.lance")
+   *     .storageOptionsProvider(provider)
+   *     .mode(WriteMode.CREATE)
+   *     .execute();
+   * }</pre>
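+   *
+   * <p>On the native side, the provider is combined with any static storage options into a
+   * {@code StorageOptionsAccessor}, which caches the provider's options until they expire.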
+ * + * @param storageOptionsProvider The storage options provider + * @return this builder instance + */ + public WriteDatasetBuilder storageOptionsProvider(StorageOptionsProvider storageOptionsProvider) { + this.storageOptionsProvider = storageOptionsProvider; + return this; + } + /** * Executes the write operation and returns the created dataset. * @@ -416,13 +460,19 @@ private Dataset executeWithNamespace() { WriteParams params = paramsBuilder.build(); // Create storage options provider for credential refresh during long-running writes - StorageOptionsProvider storageOptionsProvider = - ignoreNamespaceStorageOptions - ? null - : new LanceNamespaceStorageOptionsProvider(namespace, tableId); + // Only create namespace provider if user didn't provide one explicitly + StorageOptionsProvider effectiveProvider; + if (storageOptionsProvider != null) { + // User provided their own provider - use it + effectiveProvider = storageOptionsProvider; + } else if (!ignoreNamespaceStorageOptions) { + effectiveProvider = new LanceNamespaceStorageOptionsProvider(namespace, tableId); + } else { + effectiveProvider = null; + } // Use Dataset.create() which handles CREATE/APPEND/OVERWRITE modes - return createDatasetWithStream(tableUri, params, storageOptionsProvider); + return createDatasetWithStream(tableUri, params, effectiveProvider); } private Dataset executeWithUri() { @@ -441,7 +491,8 @@ private Dataset executeWithUri() { WriteParams params = paramsBuilder.build(); - return createDatasetWithStream(uri, params, null); + // Pass user-provided storage options provider if set + return createDatasetWithStream(uri, params, storageOptionsProvider); } private Dataset createDatasetWithStream( diff --git a/python/python/lance/__init__.py b/python/python/lance/__init__.py index ff2870b6d1a..a0a074f357b 100644 --- a/python/python/lance/__init__.py +++ b/python/python/lance/__init__.py @@ -46,6 +46,7 @@ from lance.commit import CommitLock from lance.dependencies import pandas as pd + from lance.io import StorageOptionsProvider ts_types = Union[datetime, pd.Timestamp, str] @@ -101,6 +102,7 @@ def dataset( table_id: Optional[List[str]] = None, ignore_namespace_table_storage_options: bool = False, s3_credentials_refresh_offset_seconds: Optional[int] = None, + storage_options_provider: Optional[StorageOptionsProvider] = None, ) -> LanceDataset: """ Opens the Lance dataset from the address specified. @@ -181,15 +183,41 @@ def dataset( when they have less than 60 seconds remaining before expiration. This should be set shorter than the credential lifetime to avoid using expired credentials. + storage_options_provider : optional, StorageOptionsProvider + A custom storage options provider for dynamic credential fetching. + This allows using a specific URI with namespace-based credential refresh. + If both `storage_options_provider` and `namespace`+`table_id` are provided, + the explicitly provided `storage_options_provider` takes precedence. Notes ----- - When using `namespace` and `table_id`: - - The `uri` parameter is optional and will be fetched from the namespace - - Storage options from describe_table() will be used unless - `ignore_namespace_table_storage_options=True` - - Initial storage options from describe_table() will be merged with - any provided `storage_options` + **Namespace Behavior:** + + When `namespace` and `table_id` are provided (instead of `uri`), the function + automatically: + + 1. Calls ``describe_table()`` on the namespace to get table metadata + 2. 
Uses the returned location as the dataset URI + 3. Merges namespace storage options with any user-provided storage options + (namespace options take precedence) + 4. Creates a ``LanceNamespaceStorageOptionsProvider`` for automatic credential + refresh + + **Using URI with Custom Storage Options Provider:** + + If you need to use a specific URI with namespace-based credential refresh, + pass both the URI and a storage options provider:: + + from lance.namespace import LanceNamespaceStorageOptionsProvider + + provider = LanceNamespaceStorageOptionsProvider( + namespace=my_namespace, table_id=["my_table"] + ) + ds = lance.dataset( + uri="s3://bucket/table.lance", + storage_options_provider=provider, + ) + """ # Validate that user provides either uri OR (namespace + table_id), not both has_uri = uri is not None @@ -206,7 +234,6 @@ def dataset( ) # Handle namespace resolution in Python - storage_options_provider = None if namespace is not None: if table_id is None: raise ValueError( @@ -226,9 +253,11 @@ def dataset( namespace_storage_options = response.storage_options if namespace_storage_options: - storage_options_provider = LanceNamespaceStorageOptionsProvider( - namespace=namespace, table_id=table_id - ) + # Only create namespace provider if user didn't provide one explicitly + if storage_options_provider is None: + storage_options_provider = LanceNamespaceStorageOptionsProvider( + namespace=namespace, table_id=table_id + ) if storage_options is None: storage_options = namespace_storage_options else: diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 048e231d76b..041d8628203 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -951,6 +951,21 @@ def max_field_id(self) -> int: """ return self._ds.max_field_id + def current_storage_options(self) -> Dict[str, str]: + """Get the current storage options. + + Returns a dictionary combining the initial storage_options with any + overrides from the storage_options_provider. Provider options take + precedence when keys conflict. + + The options from the provider are cached and automatically refreshed + when they expire (based on the `expires_at_millis` key). + + If neither storage_options nor storage_options_provider were specified + when opening the dataset, an empty dictionary is returned. + """ + return self._ds.current_storage_options() + def to_table( self, columns: Optional[Union[List[str], Dict[str, str]]] = None, @@ -5371,6 +5386,7 @@ def write_dataset( table_id: Optional[List[str]] = None, ignore_namespace_table_storage_options: bool = False, s3_credentials_refresh_offset_seconds: Optional[int] = None, + storage_options_provider: Optional[StorageOptionsProvider] = None, ) -> LanceDataset: """Write a given data_obj to the given uri @@ -5383,7 +5399,8 @@ def write_dataset( uri: str, Path, LanceDataset, or None Where to write the dataset to (directory). If a LanceDataset is passed, the session will be reused. - Either `uri` or (`namespace` + `table_id`) must be provided, but not both. + If not provided, and `namespace` + `table_id` are given, the URI will be + fetched from the namespace. schema: Schema, optional If specified and the input is a pandas DataFrame, use this schema instead of the default pandas to arrow table conversion. 
@@ -5464,14 +5481,15 @@ def write_dataset( **CREATE mode**: References must match bases in `initial_bases` **APPEND/OVERWRITE modes**: References must match bases in the existing manifest namespace : optional, LanceNamespace - A namespace instance from which to fetch table location and storage options. - Must be provided together with `table_id`. Cannot be used with `uri`. - When provided, the table location will be fetched automatically from the - namespace via describe_table(). Storage options will be automatically refreshed - before they expire. + A namespace instance from which to fetch table uri and storage options. + Must be provided together with `table_id`. Either `uri` or + `namespace`+`table_id` must be provided, but not both. + When provided, the table location will be fetched from the namespace. + Storage options will be automatically refreshed before they expire. table_id : optional, List[str] The table identifier when using a namespace (e.g., ["my_table"]). - Must be provided together with `namespace`. Cannot be used with `uri`. + Must be provided together with `namespace`. Either `uri` or + `namespace`+`table_id` must be provided, but not both. ignore_namespace_table_storage_options : bool, default False If True, ignore the storage options returned by the namespace and only use the provided `storage_options` parameter. The storage options provider will @@ -5485,51 +5503,79 @@ def write_dataset( when they have less than 60 seconds remaining before expiration. This should be set shorter than the credential lifetime to avoid using expired credentials. + storage_options_provider : optional, StorageOptionsProvider + A custom storage options provider for dynamic credential fetching. + This allows using a specific URI with namespace-based credential refresh. + If both `storage_options_provider` and `namespace`+`table_id` are provided, + the explicitly provided `storage_options_provider` takes precedence. Notes ----- - When using `namespace` and `table_id`: - - The `uri` parameter is optional and will be fetched from the namespace - - A `LanceNamespaceStorageOptionsProvider` will be created automatically for - storage options refresh (unless `ignore_namespace_table_storage_options=True`) - - Initial storage options from describe_table() will be merged with - any provided `storage_options` (unless - `ignore_namespace_table_storage_options=True`) + **Namespace Behavior:** + + When `namespace` and `table_id` are provided (instead of `uri`), the function + automatically: + + 1. Calls ``describe_table()`` (or ``create_empty_table()`` for create mode) + on the namespace to get table metadata + 2. Uses the returned location as the dataset URI + 3. Merges namespace storage options with any user-provided storage options + (namespace options take precedence) + 4. 
Creates a ``LanceNamespaceStorageOptionsProvider`` for automatic credential + refresh + + **Using URI with Custom Storage Options Provider:** + + If you need to use a specific URI with namespace-based credential refresh, + pass both the URI and a storage options provider:: + + from lance.namespace import LanceNamespaceStorageOptionsProvider + + provider = LanceNamespaceStorageOptionsProvider( + namespace=my_namespace, table_id=["my_table"] + ) + lance.write_dataset( + data, + uri="s3://bucket/table.lance", + storage_options_provider=provider, + ) + """ - # Validate that user provides either uri OR (namespace + table_id), not both + # Validate that exactly one of uri or namespace+tableId is provided has_uri = uri is not None - has_namespace = namespace is not None or table_id is not None + has_namespace = namespace is not None and table_id is not None if has_uri and has_namespace: raise ValueError( - "Cannot specify both 'uri' and 'namespace/table_id'. " - "Please provide either 'uri' or both 'namespace' and 'table_id'." - ) - elif not has_uri and not has_namespace: - raise ValueError( - "Must specify either 'uri' or both 'namespace' and 'table_id'." + "Cannot specify both 'uri' and 'namespace'+'table_id'. Use one or the " + "other. If you need to use a specific URI with namespace-based credential " + "refresh, create a LanceNamespaceStorageOptionsProvider manually and pass " + "it via storage_options_provider." ) - - # Handle namespace-based dataset writing - if namespace is not None: - if table_id is None: + if not has_uri and not has_namespace: + if namespace is not None: raise ValueError( - "Both 'namespace' and 'table_id' must be provided together." + "'namespace' is set but 'table_id' is missing. Both 'namespace' and " + "'table_id' must be provided together." ) + elif table_id is not None: + raise ValueError( + "'table_id' is set but 'namespace' is missing. Both 'namespace' and " + "'table_id' must be provided together." 
+            )
+        else:
+            raise ValueError("Either 'uri' or 'namespace'+'table_id' must be provided.")
 
-    # Implement write_into_namespace logic in Python
-    # This follows the same pattern as the Rust implementation:
-    # - CREATE mode: calls namespace.create_empty_table()
-    # - APPEND/OVERWRITE mode: calls namespace.describe_table()
-    # - Both modes: create storage options provider and merge storage options
-
+    # Handle namespace-based storage options; a caller-provided
+    # storage_options_provider is kept as-is and takes precedence.
+    if has_namespace:
         from .namespace import (
             CreateEmptyTableRequest,
             DescribeTableRequest,
             LanceNamespaceStorageOptionsProvider,
         )
 
-        # Determine which namespace method to call based on mode
+        # Fetch location from namespace
         if mode == "create":
             request = CreateEmptyTableRequest(
                 id=table_id, location=None, properties=None
@@ -5541,40 +5587,36 @@
         else:
             raise ValueError(f"Invalid mode: {mode}")
 
-        # Get table location from response
         uri = response.location
         if not uri:
             raise ValueError(
                 f"Namespace did not return a table location in {mode} response"
             )
 
-        # Check if we should ignore namespace storage options
+        # Get storage options from response
         if ignore_namespace_table_storage_options:
             namespace_storage_options = None
         else:
             namespace_storage_options = response.storage_options
 
         # Set up storage options and provider
-        if namespace_storage_options:
+        if not ignore_namespace_table_storage_options:
             # Create the storage options provider for automatic refresh
-            storage_options_provider = LanceNamespaceStorageOptionsProvider(
-                namespace=namespace, table_id=table_id
-            )
+            # Only if user didn't provide one explicitly
+            if storage_options_provider is None:
+                storage_options_provider = LanceNamespaceStorageOptionsProvider(
+                    namespace=namespace, table_id=table_id
+                )
 
             # Merge namespace storage options with any existing options
             # Namespace options take precedence (same as Rust implementation)
-            if storage_options is None:
-                storage_options = dict(namespace_storage_options)
-            else:
-                merged_options = dict(storage_options)
-                merged_options.update(namespace_storage_options)
-                storage_options = merged_options
-        else:
-            storage_options_provider = None
-    elif table_id is not None:
-        raise ValueError("Both 'namespace' and 'table_id' must be provided together.")
-    else:
-        storage_options_provider = None
+            if namespace_storage_options:
+                if storage_options is None:
+                    storage_options = dict(namespace_storage_options)
+                else:
+                    merged_options = dict(storage_options)
+                    merged_options.update(namespace_storage_options)
+                    storage_options = merged_options
 
     if use_legacy_format is not None:
         warnings.warn(
@@ -5611,7 +5653,7 @@ def write_dataset(
         "target_bases": target_bases,
     }
 
-    # Add storage_options_provider if created from namespace
+    # Add storage_options_provider if provided (either from namespace or user)
     if storage_options_provider is not None:
         params["storage_options_provider"] = storage_options_provider
 
diff --git a/python/src/dataset.rs b/python/src/dataset.rs
index 4c8f84287fa..2cd5927be5f 100644
--- a/python/src/dataset.rs
+++ b/python/src/dataset.rs
@@ -79,7 +79,7 @@ use lance_index::{
     },
     DatasetIndexExt, IndexParams, IndexType,
 };
-use lance_io::object_store::ObjectStoreParams;
+use lance_io::object_store::{ObjectStoreParams, StorageOptionsAccessor};
 use lance_linalg::distance::MetricType;
 use lance_table::format::{BasePath, Fragment};
 use lance_table::io::commit::CommitHandler;
@@ -645,6 +645,16 @@ impl Dataset {
         Ok(self.ds.manifest().data_storage_format.version.clone())
     }
 
+    /// Get the current storage options.
+ /// + /// Returns a dictionary combining the initial storage_options with any + /// overrides from the storage_options_provider. Provider options take + /// precedence when keys conflict. Options are cached until they expire. + fn current_storage_options(&self) -> PyResult> { + rt().block_on(None, self.ds.current_storage_options())? + .map_err(|err| PyIOError::new_err(err.to_string())) + } + /// Get index statistics fn index_statistics(&self, index_name: String) -> PyResult { rt().block_on(None, self.ds.index_statistics(&index_name))? @@ -1479,7 +1489,9 @@ impl Dataset { // `version` can be a version number or a tag name. // `storage_options` will be forwarded to the object store params for the new dataset. let store_params = storage_options.as_ref().map(|opts| ObjectStoreParams { - storage_options: Some(opts.clone()), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::new_with_options( + opts.clone(), + ))), ..Default::default() }); @@ -1671,7 +1683,7 @@ impl Dataset { // Build Ref from python object let reference = self.transform_ref(py, reference)?; let store_params = storage_options.map(|opts| ObjectStoreParams { - storage_options: Some(opts), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::new_with_options(opts))), ..Default::default() }); let created = rt() @@ -2163,26 +2175,43 @@ impl Dataset { detached: Option, max_retries: Option, ) -> PyResult { - let provider = storage_options_provider.and_then(|py_obj| { - crate::storage_options::PyStorageOptionsProvider::new(py_obj) - .ok() - .map(|py_provider| { - Arc::new( - crate::storage_options::PyStorageOptionsProviderWrapper::new(py_provider), - ) as Arc - }) - }); + let storage_options_accessor = { + let py_provider = storage_options_provider.and_then(|py_obj| { + crate::storage_options::PyStorageOptionsProvider::new(py_obj) + .ok() + .map(|py_provider| { + Arc::new( + crate::storage_options::PyStorageOptionsProviderWrapper::new( + py_provider, + ), + ) as Arc + }) + }); - let object_store_params = if storage_options.is_some() || provider.is_some() { - Some(ObjectStoreParams { - storage_options: storage_options.clone(), - storage_options_provider: provider, - ..Default::default() - }) - } else { - None + match (storage_options.clone(), py_provider) { + (Some(opts), Some(provider)) => Some(Arc::new( + StorageOptionsAccessor::new_with_options_and_provider( + opts, + provider, + std::time::Duration::from_secs(60), + ), + )), + (Some(opts), None) => { + Some(Arc::new(StorageOptionsAccessor::new_with_options(opts))) + } + (None, Some(provider)) => Some(Arc::new(StorageOptionsAccessor::new_with_provider( + provider, + std::time::Duration::from_secs(60), + ))), + (None, None) => None, + } }; + let object_store_params = storage_options_accessor.map(|accessor| ObjectStoreParams { + storage_options_accessor: Some(accessor), + ..Default::default() + }); + let commit_handler = commit_lock .as_ref() .map(|commit_lock| { @@ -2232,26 +2261,43 @@ impl Dataset { detached: Option, max_retries: Option, ) -> PyResult<(Self, PyLance)> { - let provider = storage_options_provider.and_then(|py_obj| { - crate::storage_options::PyStorageOptionsProvider::new(py_obj) - .ok() - .map(|py_provider| { - Arc::new( - crate::storage_options::PyStorageOptionsProviderWrapper::new(py_provider), - ) as Arc - }) - }); + let storage_options_accessor = { + let py_provider = storage_options_provider.and_then(|py_obj| { + crate::storage_options::PyStorageOptionsProvider::new(py_obj) + .ok() + .map(|py_provider| { + Arc::new( + 
crate::storage_options::PyStorageOptionsProviderWrapper::new( + py_provider, + ), + ) as Arc + }) + }); - let object_store_params = if storage_options.is_some() || provider.is_some() { - Some(ObjectStoreParams { - storage_options: storage_options.clone(), - storage_options_provider: provider, - ..Default::default() - }) - } else { - None + match (storage_options.clone(), py_provider) { + (Some(opts), Some(provider)) => Some(Arc::new( + StorageOptionsAccessor::new_with_options_and_provider( + opts, + provider, + std::time::Duration::from_secs(60), + ), + )), + (Some(opts), None) => { + Some(Arc::new(StorageOptionsAccessor::new_with_options(opts))) + } + (None, Some(provider)) => Some(Arc::new(StorageOptionsAccessor::new_with_provider( + provider, + std::time::Duration::from_secs(60), + ))), + (None, None) => None, + } }; + let object_store_params = storage_options_accessor.map(|accessor| ObjectStoreParams { + storage_options_accessor: Some(accessor), + ..Default::default() + }); + let commit_handler = commit_lock .map(|commit_lock| { commit_lock @@ -3056,7 +3102,13 @@ pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult>(options, "storage_options")?; - let storage_options_provider = + let s3_credentials_refresh_offset_seconds = + get_dict_opt::(options, "s3_credentials_refresh_offset_seconds")?; + let s3_credentials_refresh_offset = s3_credentials_refresh_offset_seconds + .map(std::time::Duration::from_secs) + .unwrap_or(std::time::Duration::from_secs(60)); + + let py_provider = get_dict_opt::(options, "storage_options_provider")?.and_then(|py_obj| { crate::storage_options::PyStorageOptionsProvider::new(py_obj) .ok() @@ -3065,25 +3117,29 @@ pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult + ) as Arc }) }); - let s3_credentials_refresh_offset_seconds = - get_dict_opt::(options, "s3_credentials_refresh_offset_seconds")?; - - if storage_options.is_some() - || storage_options_provider.is_some() - || s3_credentials_refresh_offset_seconds.is_some() - { - let s3_credentials_refresh_offset = s3_credentials_refresh_offset_seconds - .map(std::time::Duration::from_secs) - .unwrap_or(std::time::Duration::from_secs(60)); + let storage_options_accessor = match (storage_options.clone(), py_provider) { + (Some(opts), Some(provider)) => Some(Arc::new( + StorageOptionsAccessor::new_with_options_and_provider( + opts, + provider, + s3_credentials_refresh_offset, + ), + )), + (Some(opts), None) => Some(Arc::new(StorageOptionsAccessor::new_with_options(opts))), + (None, Some(provider)) => Some(Arc::new(StorageOptionsAccessor::new_with_provider( + provider, + s3_credentials_refresh_offset, + ))), + (None, None) => None, + }; + if storage_options_accessor.is_some() || s3_credentials_refresh_offset_seconds.is_some() { p.store_params = Some(ObjectStoreParams { - storage_options, - storage_options_provider, + storage_options_accessor, s3_credentials_refresh_offset, ..Default::default() }); diff --git a/python/src/file.rs b/python/src/file.rs index 11971e5d5d7..4d7c8370240 100644 --- a/python/src/file.rs +++ b/python/src/file.rs @@ -28,7 +28,7 @@ use lance_file::reader::{ }; use lance_file::writer::{FileWriter, FileWriterOptions}; use lance_file::{version::LanceFileVersion, LanceEncodingsIo}; -use lance_io::object_store::ObjectStoreParams; +use lance_io::object_store::{ObjectStoreParams, StorageOptionsAccessor}; use lance_io::{ scheduler::{ScanScheduler, SchedulerConfig}, utils::CachedFileSize, @@ -391,15 +391,31 @@ pub async fn object_store_from_uri_or_path_with_provider( 
s3_credentials_refresh_offset_seconds: Option, ) -> PyResult<(Arc, Path)> { let object_store_registry = Arc::new(lance::io::ObjectStoreRegistry::default()); - let mut object_store_params = ObjectStoreParams { - storage_options: storage_options.clone(), - storage_options_provider, + let s3_credentials_refresh_offset = s3_credentials_refresh_offset_seconds + .map(std::time::Duration::from_secs) + .unwrap_or(std::time::Duration::from_secs(60)); + + let storage_options_accessor = match (storage_options.clone(), storage_options_provider) { + (Some(opts), Some(provider)) => Some(Arc::new( + StorageOptionsAccessor::new_with_options_and_provider( + opts, + provider, + s3_credentials_refresh_offset, + ), + )), + (Some(opts), None) => Some(Arc::new(StorageOptionsAccessor::new_with_options(opts))), + (None, Some(provider)) => Some(Arc::new(StorageOptionsAccessor::new_with_provider( + provider, + s3_credentials_refresh_offset, + ))), + (None, None) => None, + }; + + let object_store_params = ObjectStoreParams { + storage_options_accessor, + s3_credentials_refresh_offset, ..Default::default() }; - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - object_store_params.s3_credentials_refresh_offset = - std::time::Duration::from_secs(offset_seconds); - } let (object_store, path) = ObjectStore::from_uri_and_params( object_store_registry, diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index 4375a950d09..4b6694097c9 100644 --- a/rust/lance-io/src/object_store.rs +++ b/rust/lance-io/src/object_store.rs @@ -17,7 +17,6 @@ use deepsize::DeepSizeOf; use futures::{future, stream::BoxStream, StreamExt, TryStreamExt}; use futures::{FutureExt, Stream}; use lance_core::error::LanceOptionExt; -use lance_core::utils::parse::str_is_truthy; use list_retry::ListRetryStream; #[cfg(feature = "aws")] use object_store::aws::AwsCredentialProvider; @@ -64,7 +63,8 @@ pub const DEFAULT_DOWNLOAD_RETRY_COUNT: usize = 3; pub use providers::{ObjectStoreProvider, ObjectStoreRegistry}; pub use storage_options::{ - LanceNamespaceStorageOptionsProvider, StorageOptionsProvider, EXPIRES_AT_MILLIS_KEY, + LanceNamespaceStorageOptionsProvider, StorageOptionsAccessor, StorageOptionsProvider, + EXPIRES_AT_MILLIS_KEY, }; #[async_trait] @@ -187,9 +187,9 @@ pub struct ObjectStoreParams { #[cfg(feature = "aws")] pub aws_credentials: Option, pub object_store_wrapper: Option>, - pub storage_options: Option>, - /// Dynamic storage options provider for automatic credential refresh - pub storage_options_provider: Option>, + /// Unified accessor for storage options, supporting both static options and + /// dynamic providers with automatic caching and refresh. + pub storage_options_accessor: Option>, /// Use constant size upload parts for multipart uploads. Only necessary /// for Cloudflare R2, which doesn't support variable size parts. When this /// is false, max upload size is 2.5TB. 
When this is true, the max size is @@ -208,8 +208,7 @@ impl Default for ObjectStoreParams { #[cfg(feature = "aws")] aws_credentials: None, object_store_wrapper: None, - storage_options: None, - storage_options_provider: None, + storage_options_accessor: None, use_constant_size_upload_parts: false, list_is_lexically_ordered: None, } @@ -220,7 +219,7 @@ impl Default for ObjectStoreParams { impl std::hash::Hash for ObjectStoreParams { #[allow(deprecated)] fn hash(&self, state: &mut H) { - // For hashing, we use pointer values for ObjectStore, S3 credentials, wrapper, and storage options provider + // For hashing, we use pointer values for ObjectStore, S3 credentials, and wrapper self.block_size.hash(state); if let Some((store, url)) = &self.object_store { Arc::as_ptr(store).hash(state); @@ -234,14 +233,9 @@ impl std::hash::Hash for ObjectStoreParams { if let Some(wrapper) = &self.object_store_wrapper { Arc::as_ptr(wrapper).hash(state); } - if let Some(storage_options) = &self.storage_options { - for (key, value) in storage_options { - key.hash(state); - value.hash(state); - } - } - if let Some(provider) = &self.storage_options_provider { - provider.provider_id().hash(state); + // StorageOptionsAccessor implements Hash based on initial_options + provider_id + if let Some(accessor) = &self.storage_options_accessor { + accessor.hash(state); } self.use_constant_size_upload_parts.hash(state); self.list_is_lexically_ordered.hash(state); @@ -259,7 +253,7 @@ impl PartialEq for ObjectStoreParams { } // For equality, we use pointer comparison for ObjectStore, S3 credentials, wrapper - // For storage_options_provider, we use provider_id() for semantic equality + // StorageOptionsAccessor implements PartialEq based on initial_options + provider_id self.block_size == other.block_size && self .object_store @@ -272,20 +266,41 @@ impl PartialEq for ObjectStoreParams { && self.s3_credentials_refresh_offset == other.s3_credentials_refresh_offset && self.object_store_wrapper.as_ref().map(Arc::as_ptr) == other.object_store_wrapper.as_ref().map(Arc::as_ptr) - && self.storage_options == other.storage_options - && self - .storage_options_provider - .as_ref() - .map(|p| p.provider_id()) - == other - .storage_options_provider - .as_ref() - .map(|p| p.provider_id()) + && match ( + &self.storage_options_accessor, + &other.storage_options_accessor, + ) { + (Some(a), Some(b)) => a.as_ref() == b.as_ref(), + (None, None) => true, + _ => false, + } && self.use_constant_size_upload_parts == other.use_constant_size_upload_parts && self.list_is_lexically_ordered == other.list_is_lexically_ordered } } +impl ObjectStoreParams { + /// Get the storage options accessor, creating a default one if none is set. + /// + /// This is the preferred way to access storage options. Use the accessor's + /// async methods to get merged/refreshed options. + pub fn accessor(&self) -> Arc { + self.storage_options_accessor + .clone() + .unwrap_or_else(|| Arc::new(StorageOptionsAccessor::new_with_options(HashMap::new()))) + } + + /// Get a reference to the initial storage options (synchronous, no provider refresh). + /// + /// This is primarily used for synchronous operations that need to access + /// storage options without going through the async provider refresh mechanism. 
+ pub fn storage_options_ref(&self) -> Option<&HashMap> { + self.storage_options_accessor + .as_ref() + .and_then(|a| a.initial_options()) + } +} + /// Convert a URI string or local path to a URL /// /// This function handles both proper URIs (with schemes like `file://`, `s3://`, etc.) @@ -410,7 +425,7 @@ impl ObjectStore { if let Some((store, path)) = params.object_store.as_ref() { let mut inner = store.clone(); let store_prefix = - registry.calculate_object_store_prefix(uri, params.storage_options.as_ref())?; + registry.calculate_object_store_prefix(uri, params.storage_options_ref())?; if let Some(wrapper) = params.object_store_wrapper.as_ref() { inner = wrapper.wrap(&store_prefix, inner); } @@ -763,83 +778,6 @@ impl FromStr for LanceConfigKey { } } -#[derive(Clone, Debug, Default)] -pub struct StorageOptions(pub HashMap); - -impl StorageOptions { - /// Create a new instance of [`StorageOptions`] - pub fn new(options: HashMap) -> Self { - let mut options = options; - if let Ok(value) = std::env::var("AZURE_STORAGE_ALLOW_HTTP") { - options.insert("allow_http".into(), value); - } - if let Ok(value) = std::env::var("AZURE_STORAGE_USE_HTTP") { - options.insert("allow_http".into(), value); - } - if let Ok(value) = std::env::var("AWS_ALLOW_HTTP") { - options.insert("allow_http".into(), value); - } - if let Ok(value) = std::env::var("OBJECT_STORE_CLIENT_MAX_RETRIES") { - options.insert("client_max_retries".into(), value); - } - if let Ok(value) = std::env::var("OBJECT_STORE_CLIENT_RETRY_TIMEOUT") { - options.insert("client_retry_timeout".into(), value); - } - Self(options) - } - - /// Denotes if unsecure connections via http are allowed - pub fn allow_http(&self) -> bool { - self.0.iter().any(|(key, value)| { - key.to_ascii_lowercase().contains("allow_http") & str_is_truthy(value) - }) - } - - /// Number of times to retry a download that fails - pub fn download_retry_count(&self) -> usize { - self.0 - .iter() - .find(|(key, _)| key.eq_ignore_ascii_case("download_retry_count")) - .map(|(_, value)| value.parse::().unwrap_or(3)) - .unwrap_or(3) - } - - /// Max retry times to set in RetryConfig for object store client - pub fn client_max_retries(&self) -> usize { - self.0 - .iter() - .find(|(key, _)| key.eq_ignore_ascii_case("client_max_retries")) - .and_then(|(_, value)| value.parse::().ok()) - .unwrap_or(10) - } - - /// Seconds of timeout to set in RetryConfig for object store client - pub fn client_retry_timeout(&self) -> u64 { - self.0 - .iter() - .find(|(key, _)| key.eq_ignore_ascii_case("client_retry_timeout")) - .and_then(|(_, value)| value.parse::().ok()) - .unwrap_or(180) - } - - pub fn get(&self, key: &str) -> Option<&String> { - self.0.get(key) - } - - /// Get the expiration time in milliseconds since epoch, if present - pub fn expires_at_millis(&self) -> Option { - self.0 - .get(EXPIRES_AT_MILLIS_KEY) - .and_then(|s| s.parse::().ok()) - } -} - -impl From> for StorageOptions { - fn from(value: HashMap) -> Self { - Self::new(value) - } -} - static DEFAULT_OBJECT_STORE_REGISTRY: std::sync::LazyLock = std::sync::LazyLock::new(ObjectStoreRegistry::default); @@ -975,7 +913,9 @@ mod tests { // Test the default let registry = Arc::new(ObjectStoreRegistry::default()); let params = ObjectStoreParams { - storage_options: storage_options.clone(), + storage_options_accessor: storage_options + .clone() + .map(|opts| Arc::new(StorageOptionsAccessor::new_with_options(opts))), ..ObjectStoreParams::default() }; let (store, _) = ObjectStore::from_uri_and_params(registry, uri, ¶ms) @@ -987,7 +927,8 @@ mod 
tests { let registry = Arc::new(ObjectStoreRegistry::default()); let params = ObjectStoreParams { block_size: Some(1024), - storage_options: storage_options.clone(), + storage_options_accessor: storage_options + .map(|opts| Arc::new(StorageOptionsAccessor::new_with_options(opts))), ..ObjectStoreParams::default() }; let (store, _) = ObjectStore::from_uri_and_params(registry, uri, ¶ms) diff --git a/rust/lance-io/src/object_store/providers.rs b/rust/lance-io/src/object_store/providers.rs index 17cbb3900d2..319375f186e 100644 --- a/rust/lance-io/src/object_store/providers.rs +++ b/rust/lance-io/src/object_store/providers.rs @@ -172,7 +172,7 @@ impl ObjectStoreRegistry { }; let cache_path = - provider.calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?; + provider.calculate_object_store_prefix(&base_path, params.storage_options_ref())?; let cache_key = (cache_path.clone(), params.clone()); // Check if we have a cached store for this base path and params diff --git a/rust/lance-io/src/object_store/providers/aws.rs b/rust/lance-io/src/object_store/providers/aws.rs index d5648d67c19..aacedadf34d 100644 --- a/rust/lance-io/src/object_store/providers/aws.rs +++ b/rust/lance-io/src/object_store/providers/aws.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::{collections::HashMap, fmt, str::FromStr, sync::Arc, time::Duration}; +use std::{collections::HashMap, fmt, sync::Arc, time::Duration}; #[cfg(test)] use mock_instant::thread_local::{SystemTime, UNIX_EPOCH}; @@ -28,8 +28,9 @@ use tokio::sync::RwLock; use url::Url; use crate::object_store::{ - ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptions, StorageOptionsProvider, - DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, + ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptionsAccessor, + StorageOptionsProvider, DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, + DEFAULT_MAX_IOP_SIZE, EXPIRES_AT_MILLIS_KEY, }; use lance_core::error::{Error, Result}; @@ -41,26 +42,31 @@ impl AwsStoreProvider { &self, base_path: &mut Url, params: &ObjectStoreParams, - storage_options: &StorageOptions, + accessor: &Arc, + storage_options: &HashMap, is_s3_express: bool, ) -> Result> { - let max_retries = storage_options.client_max_retries(); - let retry_timeout = storage_options.client_retry_timeout(); + let max_retries = accessor.client_max_retries().await?; + let retry_timeout = accessor.client_retry_timeout().await?; let retry_config = RetryConfig { backoff: Default::default(), max_retries, retry_timeout: Duration::from_secs(retry_timeout), }; - let mut s3_storage_options = storage_options.as_s3_options(); + let mut s3_storage_options = StorageOptionsAccessor::as_s3_options(storage_options); let region = resolve_s3_region(base_path, &s3_storage_options).await?; + let storage_options_provider = accessor.provider(); + let expires_at_millis = storage_options + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::().ok()); let (aws_creds, region) = build_aws_credential( params.s3_credentials_refresh_offset, params.aws_credentials.clone(), Some(&s3_storage_options), region, - params.storage_options_provider.clone(), - storage_options.expires_at_millis(), + storage_options_provider, + expires_at_millis, ) .await?; @@ -90,7 +96,7 @@ impl AwsStoreProvider { async fn build_opendal_s3_store( &self, base_path: &Url, - storage_options: &StorageOptions, + storage_options: &HashMap, ) -> Result> { let bucket = base_path 
.host_str() @@ -101,7 +107,7 @@ impl AwsStoreProvider { // Start with all storage options as the config map // OpenDAL will handle environment variables through its default credentials chain - let mut config_map: HashMap = storage_options.0.clone(); + let mut config_map: HashMap = storage_options.clone(); // Set required OpenDAL configuration config_map.insert("bucket".to_string(), bucket); @@ -131,25 +137,17 @@ impl ObjectStoreProvider for AwsStoreProvider { params: &ObjectStoreParams, ) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); - let mut storage_options = - StorageOptions(params.storage_options.clone().unwrap_or_default()); - storage_options.with_env_s3(); - let download_retry_count = storage_options.download_retry_count(); - - let use_opendal = storage_options - .0 - .get("use_opendal") - .map(|v| v == "true") - .unwrap_or(false); - - // Determine S3 Express and constant size upload parts before building the store - let is_s3_express = check_s3_express(&base_path, &storage_options); - - let use_constant_size_upload_parts = storage_options - .0 - .get("aws_endpoint") - .map(|endpoint| endpoint.contains("r2.cloudflarestorage.com")) - .unwrap_or(false); + let accessor = params.accessor(); + + // Get merged storage options with env vars applied + let storage_options = accessor.get_options_with_s3_env().await?; + let download_retry_count = accessor.download_retry_count().await?; + let use_opendal = accessor.s3_use_opendal().await?; + let is_s3_express_opt = accessor.s3_express().await?; + let use_constant_size_upload_parts = accessor.is_cloudflare_r2().await?; + + // S3 Express can also be detected from URL + let is_s3_express = is_s3_express_opt || base_path.authority().ends_with("--x-s3"); let inner = if use_opendal { // Use OpenDAL implementation @@ -157,8 +155,14 @@ impl ObjectStoreProvider for AwsStoreProvider { .await? } else { // Use default Amazon S3 implementation - self.build_amazon_s3_store(&mut base_path, params, &storage_options, is_s3_express) - .await? + self.build_amazon_s3_store( + &mut base_path, + params, + &accessor, + &storage_options, + is_s3_express, + ) + .await? }; Ok(ObjectStore { @@ -175,10 +179,10 @@ impl ObjectStoreProvider for AwsStoreProvider { } } -/// Check if the storage is S3 Express -fn check_s3_express(url: &Url, storage_options: &StorageOptions) -> bool { +/// Check if the storage is S3 Express (for tests with HashMap) +#[cfg(test)] +fn check_s3_express(url: &Url, storage_options: &HashMap) -> bool { storage_options - .0 .get("s3_express") .map(|v| v == "true") .unwrap_or(false) @@ -433,33 +437,6 @@ impl CredentialProvider for AwsCredentialAdapter { } } -impl StorageOptions { - /// Add values from the environment to storage options - pub fn with_env_s3(&mut self) { - for (os_key, os_value) in std::env::vars_os() { - if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { - if let Ok(config_key) = AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()) { - if !self.0.contains_key(config_key.as_ref()) { - self.0 - .insert(config_key.as_ref().to_string(), value.to_string()); - } - } - } - } - } - - /// Subset of options relevant for s3 storage - pub fn as_s3_options(&self) -> HashMap { - self.0 - .iter() - .filter_map(|(key, value)| { - let s3_key = AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()).ok()?; - Some((s3_key, value.clone())) - }) - .collect() - } -} - impl ObjectStoreParams { /// Create a new instance of [`ObjectStoreParams`] based on the AWS credentials. 
pub fn with_aws_credentials( @@ -468,8 +445,11 @@ impl ObjectStoreParams { ) -> Self { Self { aws_credentials, - storage_options: region - .map(|region| [("region".into(), region)].iter().cloned().collect()), + storage_options_accessor: region.map(|region| { + Arc::new(StorageOptionsAccessor::new_with_options( + [("region".into(), region)].iter().cloned().collect(), + )) + }), ..Default::default() } } @@ -610,9 +590,10 @@ impl DynamicStorageOptionsCredentialProvider { source: "No storage options available".into(), })?; - let storage_options = StorageOptions(storage_options_map); - let expires_at_millis = storage_options.expires_at_millis(); - let s3_options = storage_options.as_s3_options(); + let expires_at_millis = storage_options_map + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::().ok()); + let s3_options = StorageOptionsAccessor::as_s3_options(&storage_options_map); let static_creds = extract_static_s3_credentials(&s3_options).ok_or_else(|| { object_store::Error::Generic { store: "DynamicStorageOptionsCredentialProvider", @@ -785,8 +766,7 @@ mod tests { for (uri, storage_map, expected) in cases { let url = Url::parse(uri).unwrap(); - let storage_options = StorageOptions(storage_map); - let is_s3_express = check_s3_express(&url, &storage_options); + let is_s3_express = check_s3_express(&url, &storage_map); assert_eq!(is_s3_express, expected); } } @@ -796,10 +776,12 @@ mod tests { let provider = AwsStoreProvider; let url = Url::parse("s3://test-bucket/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options: Some(HashMap::from([ - ("use_opendal".to_string(), "true".to_string()), - ("region".to_string(), "us-west-2".to_string()), - ])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::new_with_options( + HashMap::from([ + ("use_opendal".to_string(), "true".to_string()), + ("region".to_string(), "us-west-2".to_string()), + ]), + ))), ..Default::default() }; diff --git a/rust/lance-io/src/object_store/providers/azure.rs b/rust/lance-io/src/object_store/providers/azure.rs index 7a90fc6744a..696be85e0d1 100644 --- a/rust/lance-io/src/object_store/providers/azure.rs +++ b/rust/lance-io/src/object_store/providers/azure.rs @@ -3,7 +3,6 @@ use std::{ collections::HashMap, - str::FromStr, sync::{Arc, LazyLock}, time::Duration, }; @@ -13,15 +12,12 @@ use object_store_opendal::OpendalStore; use opendal::{services::Azblob, Operator}; use snafu::location; -use object_store::{ - azure::{AzureConfigKey, MicrosoftAzureBuilder}, - RetryConfig, -}; +use object_store::{azure::MicrosoftAzureBuilder, RetryConfig}; use url::Url; use crate::object_store::{ - ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptions, DEFAULT_CLOUD_BLOCK_SIZE, - DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, + ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptionsAccessor, + DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, }; use lance_core::error::{Error, Result}; @@ -32,7 +28,7 @@ impl AzureBlobStoreProvider { async fn build_opendal_azure_store( &self, base_path: &Url, - storage_options: &StorageOptions, + storage_options: &HashMap, ) -> Result> { let container = base_path .host_str() @@ -45,7 +41,7 @@ impl AzureBlobStoreProvider { // Start with all storage options as the config map // OpenDAL will handle environment variables through its default credentials chain - let mut config_map: HashMap = storage_options.0.clone(); + let mut config_map: HashMap = storage_options.clone(); // Set required OpenDAL configuration 
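         // (The container name is derived from the URI authority above, so it
         // overrides any user-supplied "container" entry in the storage options.)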
config_map.insert("container".to_string(), container); @@ -69,10 +65,11 @@ impl AzureBlobStoreProvider { async fn build_microsoft_azure_store( &self, base_path: &Url, - storage_options: &StorageOptions, + accessor: &Arc, + storage_options: &HashMap, ) -> Result> { - let max_retries = storage_options.client_max_retries(); - let retry_timeout = storage_options.client_retry_timeout(); + let max_retries = accessor.client_max_retries().await?; + let retry_timeout = accessor.client_retry_timeout().await?; let retry_config = RetryConfig { backoff: Default::default(), max_retries, @@ -82,7 +79,7 @@ impl AzureBlobStoreProvider { let mut builder = MicrosoftAzureBuilder::new() .with_url(base_path.as_ref()) .with_retry(retry_config); - for (key, value) in storage_options.as_azure_options() { + for (key, value) in StorageOptionsAccessor::as_azure_options(storage_options) { builder = builder.with_config(key, value); } @@ -94,22 +91,18 @@ impl AzureBlobStoreProvider { impl ObjectStoreProvider for AzureBlobStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); - let mut storage_options = - StorageOptions(params.storage_options.clone().unwrap_or_default()); - storage_options.with_env_azure(); - let download_retry_count = storage_options.download_retry_count(); + let accessor = params.accessor(); - let use_opendal = storage_options - .0 - .get("use_opendal") - .map(|v| v.as_str() == "true") - .unwrap_or(false); + // Get merged storage options with Azure env vars applied + let storage_options = accessor.get_options_with_azure_env().await?; + let download_retry_count = accessor.download_retry_count().await?; + let use_opendal = accessor.azure_use_opendal().await?; let inner = if use_opendal { self.build_opendal_azure_store(&base_path, &storage_options) .await? } else { - self.build_microsoft_azure_store(&base_path, &storage_options) + self.build_microsoft_azure_store(&base_path, &accessor, &storage_options) .await? }; @@ -147,11 +140,11 @@ impl ObjectStoreProvider for AzureBlobStoreProvider { // The URI looks like 'az://container/path-part/file'. // We must look at the storage options to find the account. let mut account = match storage_options { - Some(opts) => StorageOptions::find_configured_storage_account(opts), + Some(opts) => StorageOptionsAccessor::find_configured_storage_account(opts), None => None, }; if account.is_none() { - account = StorageOptions::find_configured_storage_account(&ENV_OPTIONS.0); + account = StorageOptionsAccessor::find_configured_storage_account(&ENV_OPTIONS); } let account = account.ok_or(Error::invalid_input( "Unable to find object store prefix: no Azure account name in URI, and no storage account configured.", @@ -164,59 +157,15 @@ impl ObjectStoreProvider for AzureBlobStoreProvider { } } -static ENV_OPTIONS: LazyLock = LazyLock::new(StorageOptions::from_env); - -impl StorageOptions { - /// Iterate over all environment variables, looking for anything related to Azure. 
- fn from_env() -> Self { - let mut opts = HashMap::::new(); - for (os_key, os_value) in std::env::vars_os() { - if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { - if let Ok(config_key) = AzureConfigKey::from_str(&key.to_ascii_lowercase()) { - opts.insert(config_key.as_ref().to_string(), value.to_string()); - } - } - } - Self(opts) - } - - /// Add values from the environment to storage options - pub fn with_env_azure(&mut self) { - for (os_key, os_value) in &ENV_OPTIONS.0 { - if !self.0.contains_key(os_key) { - self.0.insert(os_key.clone(), os_value.clone()); - } - } - } - - /// Subset of options relevant for azure storage - pub fn as_azure_options(&self) -> HashMap { - self.0 - .iter() - .filter_map(|(key, value)| { - let az_key = AzureConfigKey::from_str(&key.to_ascii_lowercase()).ok()?; - Some((az_key, value.clone())) - }) - .collect() - } - - #[allow(clippy::manual_map)] - fn find_configured_storage_account(map: &HashMap) -> Option { - if let Some(account) = map.get("azure_storage_account_name") { - Some(account.clone()) - } else if let Some(account) = map.get("account_name") { - Some(account.clone()) - } else { - None - } - } -} +static ENV_OPTIONS: LazyLock> = + LazyLock::new(StorageOptionsAccessor::from_env_azure); #[cfg(test)] mod tests { use super::*; use crate::object_store::ObjectStoreParams; use std::collections::HashMap; + use std::sync::Arc; #[test] fn test_azure_store_path() { @@ -233,18 +182,20 @@ mod tests { let provider = AzureBlobStoreProvider; let url = Url::parse("az://test-container/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options: Some(HashMap::from([ - ("use_opendal".to_string(), "true".to_string()), - ("account_name".to_string(), "test_account".to_string()), - ( - "endpoint".to_string(), - "https://test_account.blob.core.windows.net".to_string(), - ), - ( - "account_key".to_string(), - "dGVzdF9hY2NvdW50X2tleQ==".to_string(), - ), - ])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::new_with_options( + HashMap::from([ + ("use_opendal".to_string(), "true".to_string()), + ("account_name".to_string(), "test_account".to_string()), + ( + "endpoint".to_string(), + "https://test_account.blob.core.windows.net".to_string(), + ), + ( + "account_key".to_string(), + "dGVzdF9hY2NvdW50X2tleQ==".to_string(), + ), + ]), + ))), ..Default::default() }; @@ -259,7 +210,7 @@ mod tests { fn test_find_configured_storage_account() { assert_eq!( Some("myaccount".to_string()), - StorageOptions::find_configured_storage_account(&HashMap::from_iter( + StorageOptionsAccessor::find_configured_storage_account(&HashMap::from_iter( [ ("access_key".to_string(), "myaccesskey".to_string()), ( diff --git a/rust/lance-io/src/object_store/providers/gcp.rs b/rust/lance-io/src/object_store/providers/gcp.rs index 038015d7f4e..9c20112909a 100644 --- a/rust/lance-io/src/object_store/providers/gcp.rs +++ b/rust/lance-io/src/object_store/providers/gcp.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration}; +use std::{collections::HashMap, sync::Arc, time::Duration}; use object_store::ObjectStore as OSObjectStore; use object_store_opendal::OpendalStore; @@ -9,14 +9,14 @@ use opendal::{services::Gcs, Operator}; use snafu::location; use object_store::{ - gcp::{GcpCredential, GoogleCloudStorageBuilder, GoogleConfigKey}, + gcp::{GcpCredential, GoogleCloudStorageBuilder}, RetryConfig, StaticCredentialProvider, }; use 
url::Url; use crate::object_store::{ - ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptions, DEFAULT_CLOUD_BLOCK_SIZE, - DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, + ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptionsAccessor, + DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, }; use lance_core::error::{Error, Result}; @@ -27,7 +27,7 @@ impl GcsStoreProvider { async fn build_opendal_gcs_store( &self, base_path: &Url, - storage_options: &StorageOptions, + storage_options: &HashMap, ) -> Result> { let bucket = base_path .host_str() @@ -38,7 +38,7 @@ impl GcsStoreProvider { // Start with all storage options as the config map // OpenDAL will handle environment variables through its default credentials chain - let mut config_map: HashMap = storage_options.0.clone(); + let mut config_map: HashMap = storage_options.clone(); // Set required OpenDAL configuration config_map.insert("bucket".to_string(), bucket); @@ -62,10 +62,11 @@ impl GcsStoreProvider { async fn build_google_cloud_store( &self, base_path: &Url, - storage_options: &StorageOptions, + accessor: &Arc, + storage_options: &HashMap, ) -> Result> { - let max_retries = storage_options.client_max_retries(); - let retry_timeout = storage_options.client_retry_timeout(); + let max_retries = accessor.client_max_retries().await?; + let retry_timeout = accessor.client_retry_timeout().await?; let retry_config = RetryConfig { backoff: Default::default(), max_retries, @@ -75,14 +76,12 @@ impl GcsStoreProvider { let mut builder = GoogleCloudStorageBuilder::new() .with_url(base_path.as_ref()) .with_retry(retry_config); - for (key, value) in storage_options.as_gcs_options() { + for (key, value) in StorageOptionsAccessor::as_gcs_options(storage_options) { builder = builder.with_config(key, value); } - let token_key = "google_storage_token"; - if let Some(storage_token) = storage_options.get(token_key) { - let credential = GcpCredential { - bearer: storage_token.clone(), - }; + let storage_token = accessor.google_storage_token().await?; + if let Some(token) = storage_token { + let credential = GcpCredential { bearer: token }; let credential_provider = Arc::new(StaticCredentialProvider::new(credential)) as _; builder = builder.with_credentials(credential_provider); } @@ -95,22 +94,18 @@ impl GcsStoreProvider { impl ObjectStoreProvider for GcsStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); - let mut storage_options = - StorageOptions(params.storage_options.clone().unwrap_or_default()); - storage_options.with_env_gcs(); - let download_retry_count = storage_options.download_retry_count(); + let accessor = params.accessor(); - let use_opendal = storage_options - .0 - .get("use_opendal") - .map(|v| v.as_str() == "true") - .unwrap_or(false); + // Get merged storage options with GCS env vars applied + let storage_options = accessor.get_options_with_gcs_env().await?; + let download_retry_count = accessor.download_retry_count().await?; + let use_opendal = accessor.gcs_use_opendal().await?; let inner = if use_opendal { self.build_opendal_gcs_store(&base_path, &storage_options) .await? } else { - self.build_google_cloud_store(&base_path, &storage_options) + self.build_google_cloud_store(&base_path, &accessor, &storage_options) .await? 
}; @@ -128,45 +123,12 @@ impl ObjectStoreProvider for GcsStoreProvider { } } -impl StorageOptions { - /// Add values from the environment to storage options - pub fn with_env_gcs(&mut self) { - for (os_key, os_value) in std::env::vars_os() { - if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { - let lowercase_key = key.to_ascii_lowercase(); - let token_key = "google_storage_token"; - - if let Ok(config_key) = GoogleConfigKey::from_str(&lowercase_key) { - if !self.0.contains_key(config_key.as_ref()) { - self.0 - .insert(config_key.as_ref().to_string(), value.to_string()); - } - } - // Check for GOOGLE_STORAGE_TOKEN until GoogleConfigKey supports storage token - else if lowercase_key == token_key && !self.0.contains_key(token_key) { - self.0.insert(token_key.to_string(), value.to_string()); - } - } - } - } - - /// Subset of options relevant for gcs storage - pub fn as_gcs_options(&self) -> HashMap { - self.0 - .iter() - .filter_map(|(key, value)| { - let gcs_key = GoogleConfigKey::from_str(&key.to_ascii_lowercase()).ok()?; - Some((gcs_key, value.clone())) - }) - .collect() - } -} - #[cfg(test)] mod tests { use super::*; use crate::object_store::ObjectStoreParams; use std::collections::HashMap; + use std::sync::Arc; #[test] fn test_gcs_store_path() { @@ -183,13 +145,15 @@ mod tests { let provider = GcsStoreProvider; let url = Url::parse("gs://test-bucket/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options: Some(HashMap::from([ - ("use_opendal".to_string(), "true".to_string()), - ( - "service_account".to_string(), - "test@example.iam.gserviceaccount.com".to_string(), - ), - ])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::new_with_options( + HashMap::from([ + ("use_opendal".to_string(), "true".to_string()), + ( + "service_account".to_string(), + "test@example.iam.gserviceaccount.com".to_string(), + ), + ]), + ))), ..Default::default() }; diff --git a/rust/lance-io/src/object_store/providers/huggingface.rs b/rust/lance-io/src/object_store/providers/huggingface.rs index c52c85a3c72..833783f8541 100644 --- a/rust/lance-io/src/object_store/providers/huggingface.rs +++ b/rust/lance-io/src/object_store/providers/huggingface.rs @@ -13,7 +13,7 @@ use url::Url; use crate::object_store::parse_hf_repo_id; use crate::object_store::{ - ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptions, DEFAULT_CLOUD_BLOCK_SIZE, + ObjectStore, ObjectStoreParams, ObjectStoreProvider, DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, }; use lance_core::error::{Error, Result}; @@ -65,8 +65,8 @@ impl ObjectStoreProvider for HuggingfaceStoreProvider { } = parse_hf_url(&base_path)?; let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); - let download_retry_count = storage_options.download_retry_count(); + let accessor = params.accessor(); + let download_retry_count = accessor.download_retry_count().await?; // Build OpenDAL config with allowed keys only. let mut config_map: HashMap = HashMap::new(); @@ -74,22 +74,17 @@ impl ObjectStoreProvider for HuggingfaceStoreProvider { config_map.insert("repo_type".to_string(), repo_type); config_map.insert("repo_id".to_string(), repo_id); - if let Some(rev) = storage_options.get("hf_revision").cloned() { + if let Some(rev) = accessor.hf_revision().await? 
{ config_map.insert("revision".to_string(), rev); } - if let Some(root) = storage_options.get("hf_root").cloned() { + if let Some(root) = accessor.hf_root().await? { if !root.is_empty() { config_map.insert("root".to_string(), root); } } - if let Some(token) = storage_options - .get("hf_token") - .cloned() - .or_else(|| std::env::var("HF_TOKEN").ok()) - .or_else(|| std::env::var("HUGGINGFACE_TOKEN").ok()) - { + if let Some(token) = accessor.hf_token().await? { config_map.insert("token".to_string(), token); } @@ -140,6 +135,7 @@ impl ObjectStoreProvider for HuggingfaceStoreProvider { #[cfg(test)] mod tests { use super::*; + use crate::object_store::StorageOptionsAccessor; #[test] fn parse_basic_url() { @@ -155,14 +151,13 @@ mod tests { ); } - #[test] - fn storage_option_revision_takes_precedence() { + #[tokio::test] + async fn storage_option_revision_takes_precedence() { let url = Url::parse("hf://datasets/acme/repo/data/file").unwrap(); let params = ObjectStoreParams { - storage_options: Some(HashMap::from([( - String::from("hf_revision"), - String::from("stable"), - )])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::new_with_options( + HashMap::from([(String::from("hf_revision"), String::from("stable"))]), + ))), ..Default::default() }; // new_store should accept without creating operator; test precedence via builder config @@ -174,13 +169,8 @@ mod tests { let mut config_map: HashMap = HashMap::new(); config_map.insert("repo_type".to_string(), repo_type); config_map.insert("repo".to_string(), repo_id); - if let Some(rev) = params - .storage_options - .as_ref() - .unwrap() - .get("hf_revision") - .cloned() - { + let accessor = params.accessor(); + if let Some(rev) = accessor.hf_revision().await.unwrap() { config_map.insert("revision".to_string(), rev); } assert_eq!(config_map.get("revision").unwrap(), "stable"); diff --git a/rust/lance-io/src/object_store/providers/local.rs b/rust/lance-io/src/object_store/providers/local.rs index 74f2777992b..05b1d7ef91e 100644 --- a/rust/lance-io/src/object_store/providers/local.rs +++ b/rust/lance-io/src/object_store/providers/local.rs @@ -4,7 +4,7 @@ use std::{collections::HashMap, sync::Arc}; use crate::object_store::{ - ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptions, DEFAULT_LOCAL_BLOCK_SIZE, + ObjectStore, ObjectStoreParams, ObjectStoreProvider, DEFAULT_LOCAL_BLOCK_SIZE, DEFAULT_LOCAL_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, }; use lance_core::error::Result; @@ -20,8 +20,8 @@ pub struct FileStoreProvider; impl ObjectStoreProvider for FileStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_LOCAL_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); - let download_retry_count = storage_options.download_retry_count(); + let accessor = params.accessor(); + let download_retry_count = accessor.download_retry_count().await?; Ok(ObjectStore { inner: Arc::new(LocalFileSystem::new()), scheme: base_path.scheme().to_owned(), diff --git a/rust/lance-io/src/object_store/providers/memory.rs b/rust/lance-io/src/object_store/providers/memory.rs index 9519806ed70..4a0f53558b1 100644 --- a/rust/lance-io/src/object_store/providers/memory.rs +++ b/rust/lance-io/src/object_store/providers/memory.rs @@ -4,8 +4,8 @@ use std::{collections::HashMap, sync::Arc}; use crate::object_store::{ - ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptions, - DEFAULT_CLOUD_IO_PARALLELISM, 
DEFAULT_LOCAL_BLOCK_SIZE, DEFAULT_MAX_IOP_SIZE, + ObjectStore, ObjectStoreParams, ObjectStoreProvider, DEFAULT_CLOUD_IO_PARALLELISM, + DEFAULT_LOCAL_BLOCK_SIZE, DEFAULT_MAX_IOP_SIZE, }; use lance_core::error::Result; use object_store::{memory::InMemory, path::Path}; @@ -19,8 +19,8 @@ pub struct MemoryStoreProvider; impl ObjectStoreProvider for MemoryStoreProvider { async fn new_store(&self, _base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_LOCAL_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); - let download_retry_count = storage_options.download_retry_count(); + let accessor = params.accessor(); + let download_retry_count = accessor.download_retry_count().await?; Ok(ObjectStore { inner: Arc::new(InMemory::new()), scheme: String::from("memory"), diff --git a/rust/lance-io/src/object_store/providers/oss.rs b/rust/lance-io/src/object_store/providers/oss.rs index 3437ec8d1b6..a8ad311ef5c 100644 --- a/rust/lance-io/src/object_store/providers/oss.rs +++ b/rust/lance-io/src/object_store/providers/oss.rs @@ -10,7 +10,7 @@ use snafu::location; use url::Url; use crate::object_store::{ - ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptions, DEFAULT_CLOUD_BLOCK_SIZE, + ObjectStore, ObjectStoreParams, ObjectStoreProvider, DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, }; use lance_core::error::{Error, Result}; @@ -22,7 +22,8 @@ pub struct OssStoreProvider; impl ObjectStoreProvider for OssStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let accessor = params.accessor(); + let download_retry_count = accessor.download_retry_count().await?; let bucket = base_path .host_str() @@ -53,21 +54,21 @@ impl ObjectStoreProvider for OssStoreProvider { config_map.insert("root".to_string(), "/".to_string()); } - // Override with storage options if provided - if let Some(endpoint) = storage_options.0.get("oss_endpoint") { - config_map.insert("endpoint".to_string(), endpoint.clone()); + // Override with storage options if provided (using accessor async methods) + if let Some(endpoint) = accessor.oss_endpoint().await? { + config_map.insert("endpoint".to_string(), endpoint); } - if let Some(access_key_id) = storage_options.0.get("oss_access_key_id") { - config_map.insert("access_key_id".to_string(), access_key_id.clone()); + if let Some(access_key_id) = accessor.oss_access_key_id().await? { + config_map.insert("access_key_id".to_string(), access_key_id); } - if let Some(secret_access_key) = storage_options.0.get("oss_secret_access_key") { - config_map.insert("access_key_secret".to_string(), secret_access_key.clone()); + if let Some(secret_access_key) = accessor.oss_secret_access_key().await? { + config_map.insert("access_key_secret".to_string(), secret_access_key); } - if let Some(region) = storage_options.0.get("oss_region") { - config_map.insert("region".to_string(), region.clone()); + if let Some(region) = accessor.oss_region().await? 
{
+            config_map.insert("region".to_string(), region);
         }
 
         if !config_map.contains_key("endpoint") {
@@ -101,7 +102,7 @@ impl ObjectStoreProvider for OssStoreProvider {
             use_constant_size_upload_parts: params.use_constant_size_upload_parts,
             list_is_lexically_ordered: params.list_is_lexically_ordered.unwrap_or(true),
             io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM,
-            download_retry_count: storage_options.download_retry_count(),
+            download_retry_count,
             io_tracker: Default::default(),
         })
     }
diff --git a/rust/lance-io/src/object_store/storage_options.rs b/rust/lance-io/src/object_store/storage_options.rs
index f809df8d1d3..f5c165587dd 100644
--- a/rust/lance-io/src/object_store/storage_options.rs
+++ b/rust/lance-io/src/object_store/storage_options.rs
@@ -9,13 +9,44 @@
 
 use std::collections::HashMap;
 use std::fmt;
+use std::str::FromStr;
 use std::sync::Arc;
+use std::time::Duration;
+
+#[cfg(test)]
+use mock_instant::thread_local::MockClock;
+#[cfg(not(test))]
+use std::time::{SystemTime, UNIX_EPOCH};
+
+#[cfg(feature = "aws")]
+use object_store::aws::AmazonS3ConfigKey;
+#[cfg(feature = "azure")]
+use object_store::azure::AzureConfigKey;
+#[cfg(feature = "gcp")]
+use object_store::gcp::GoogleConfigKey;
 
 use crate::{Error, Result};
+
+/// Get the current time in milliseconds since UNIX epoch.
+/// Uses MockClock in test mode for predictable time-based tests.
+#[cfg(test)]
+fn current_time_millis() -> u64 {
+    MockClock::system_time().as_millis() as u64
+}
+
+/// Get the current time in milliseconds since UNIX epoch.
+#[cfg(not(test))]
+fn current_time_millis() -> u64 {
+    SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .unwrap_or(Duration::from_secs(0))
+        .as_millis() as u64
+}
 use async_trait::async_trait;
 use lance_namespace::models::DescribeTableRequest;
 use lance_namespace::LanceNamespace;
 use snafu::location;
+use tokio::sync::RwLock;
 
 /// Key for the expiration timestamp in storage options HashMap
 pub const EXPIRES_AT_MILLIS_KEY: &str = "expires_at_millis";
@@ -140,3 +171,1146 @@ impl StorageOptionsProvider for LanceNamespaceStorageOptionsProvider {
         )
     }
 }
+
+/// Cached storage options from the provider with expiration tracking
+#[derive(Debug, Clone)]
+struct CachedProviderOptions {
+    options: HashMap<String, String>,
+    expires_at_millis: Option<u64>,
+    /// Version number incremented on each refresh, used by credential providers
+    /// to know when to re-extract credentials
+    version: u64,
+}
+
+/// Unified accessor for storage options with optional caching provider
+///
+/// This struct encapsulates both static storage options and an optional dynamic
+/// provider. When both are present, options are merged with provider options
+/// taking precedence over initial options.
+///
+/// Thread-safe with double-check locking pattern to prevent concurrent refreshes.
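+///
+/// A minimal usage sketch (the provider value and the option keys are illustrative):
+///
+/// ```ignore
+/// let accessor = StorageOptionsAccessor::new_with_options_and_provider(
+///     HashMap::from([("region".to_string(), "us-west-2".to_string())]),
+///     provider, // Arc<dyn StorageOptionsProvider>
+///     Duration::from_secs(60), // refresh this long before the cached options expire
+/// );
+/// // Merged view: provider-supplied values win over the static map on key conflicts.
+/// let options = accessor.get_options().await?;
+/// ```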
+pub struct StorageOptionsAccessor {
+    /// Initial/static storage options (always available immediately)
+    initial_options: Option<HashMap<String, String>>,
+    /// Optional dynamic provider for refreshable options
+    provider: Option<Arc<dyn StorageOptionsProvider>>,
+    /// Cache for provider options (only used when provider is present)
+    provider_cache: Arc<RwLock<Option<CachedProviderOptions>>>,
+    /// Duration before expiry to refresh options
+    refresh_offset: Duration,
+    /// Counter for generating unique version numbers
+    next_version: Arc<std::sync::atomic::AtomicU64>,
+}
+
+impl fmt::Debug for StorageOptionsAccessor {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("StorageOptionsAccessor")
+            .field("initial_options", &self.initial_options.is_some())
+            .field("provider", &self.provider)
+            .field("refresh_offset", &self.refresh_offset)
+            .finish()
+    }
+}
+
+// Hash is based on initial_options and provider's provider_id
+impl std::hash::Hash for StorageOptionsAccessor {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        // Hash initial options in a deterministic order
+        if let Some(opts) = &self.initial_options {
+            let mut sorted_keys: Vec<_> = opts.keys().collect();
+            sorted_keys.sort();
+            for key in sorted_keys {
+                key.hash(state);
+                opts.get(key).hash(state);
+            }
+        }
+        // Hash provider's provider_id if present
+        if let Some(provider) = &self.provider {
+            provider.provider_id().hash(state);
+        }
+    }
+}
+
+impl PartialEq for StorageOptionsAccessor {
+    fn eq(&self, other: &Self) -> bool {
+        self.initial_options == other.initial_options
+            && self.provider.as_ref().map(|p| p.provider_id())
+                == other.provider.as_ref().map(|p| p.provider_id())
+    }
+}
+
+impl Eq for StorageOptionsAccessor {}
+
+impl StorageOptionsAccessor {
+    /// Create a new accessor with only static storage options (no provider)
+    ///
+    /// # Arguments
+    /// * `options` - Static storage options
+    pub fn new_with_options(options: HashMap<String, String>) -> Self {
+        Self {
+            initial_options: Some(options),
+            provider: None,
+            provider_cache: Arc::new(RwLock::new(None)),
+            refresh_offset: Duration::from_secs(60),
+            next_version: Arc::new(std::sync::atomic::AtomicU64::new(1)),
+        }
+    }
+
+    /// Create a new accessor with only a dynamic provider (no static options)
+    ///
+    /// # Arguments
+    /// * `provider` - The storage options provider
+    /// * `refresh_offset` - Duration before expiry to refresh options (e.g., 60 seconds)
+    pub fn new_with_provider(
+        provider: Arc<dyn StorageOptionsProvider>,
+        refresh_offset: Duration,
+    ) -> Self {
+        Self {
+            initial_options: None,
+            provider: Some(provider),
+            provider_cache: Arc::new(RwLock::new(None)),
+            refresh_offset,
+            next_version: Arc::new(std::sync::atomic::AtomicU64::new(1)),
+        }
+    }
+
+    /// Create a new accessor with both static options and a dynamic provider
+    ///
+    /// When getting options, provider options take precedence over initial options.
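+    ///
+    /// For example (hypothetical values): with initial options
+    /// `{"region": "us-west-2", "token": "static"}` and a provider returning
+    /// `{"token": "fresh"}`, `get_options` yields
+    /// `{"region": "us-west-2", "token": "fresh"}`.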
+    ///
+    /// # Arguments
+    /// * `initial_options` - Static storage options
+    /// * `provider` - The storage options provider
+    /// * `refresh_offset` - Duration before expiry to refresh options (e.g., 60 seconds)
+    pub fn new_with_options_and_provider(
+        initial_options: HashMap<String, String>,
+        provider: Arc<dyn StorageOptionsProvider>,
+        refresh_offset: Duration,
+    ) -> Self {
+        Self {
+            initial_options: Some(initial_options),
+            provider: Some(provider),
+            provider_cache: Arc::new(RwLock::new(None)),
+            refresh_offset,
+            next_version: Arc::new(std::sync::atomic::AtomicU64::new(1)),
+        }
+    }
+
+    /// Create a new accessor with both static options and a dynamic provider,
+    /// with initial provider options pre-populated in the cache
+    ///
+    /// # Arguments
+    /// * `initial_options` - Static storage options
+    /// * `provider` - The storage options provider
+    /// * `refresh_offset` - Duration before expiry to refresh options
+    /// * `initial_provider_options` - Initial provider options to cache
+    pub fn new_with_initial_provider_cache(
+        initial_options: Option<HashMap<String, String>>,
+        provider: Arc<dyn StorageOptionsProvider>,
+        refresh_offset: Duration,
+        initial_provider_options: HashMap<String, String>,
+    ) -> Self {
+        let expires_at_millis: Option<u64> = initial_provider_options
+            .get(EXPIRES_AT_MILLIS_KEY)
+            .and_then(|v| v.parse().ok());
+        Self {
+            initial_options,
+            provider: Some(provider),
+            provider_cache: Arc::new(RwLock::new(Some(CachedProviderOptions {
+                options: initial_provider_options,
+                expires_at_millis,
+                version: 1,
+            }))),
+            refresh_offset,
+            next_version: Arc::new(std::sync::atomic::AtomicU64::new(2)),
+        }
+    }
+
+    /// Get current storage options, merging initial options with provider options.
+    /// Provider options take precedence over initial options.
+    pub async fn get_options(&self) -> Result<Option<HashMap<String, String>>> {
+        let (opts, _version) = self.get_options_with_version().await?;
+        Ok(opts)
+    }
+
+    /// Get current storage options with version number.
+    /// The version number increments on each provider refresh, allowing callers to detect changes.
+    pub async fn get_options_with_version(&self) -> Result<(Option<HashMap<String, String>>, u64)> {
+        // If no provider, just return initial options with version 0
+        if self.provider.is_none() {
+            return Ok((self.initial_options.clone(), 0));
+        }
+
+        // Get provider options (with caching)
+        let (provider_opts, version) = self.get_provider_options_with_version().await?;
+
+        // Merge: start with initial, extend with provider (provider takes precedence)
+        let merged = match (&self.initial_options, provider_opts) {
+            (None, None) => None,
+            (Some(initial), None) => Some(initial.clone()),
+            (None, Some(provider)) => Some(provider),
+            (Some(initial), Some(provider)) => {
+                let mut merged = initial.clone();
+                merged.extend(provider);
+                Some(merged)
+            }
+        };
+
+        Ok((merged, version))
+    }
+
+    /// Get cached provider options, refreshing if expired.
+    async fn get_provider_options_with_version(
+        &self,
+    ) -> Result<(Option<HashMap<String, String>>, u64)> {
+        loop {
+            match self.do_get_provider_options().await? {
+                Some((opts, version)) => return Ok((opts, version)),
+                None => {
+                    // Write lock was busy, wait and retry
+                    tokio::time::sleep(Duration::from_millis(10)).await;
+                }
+            }
+        }
+    }
+
+    /// Internal implementation that returns None if write lock is busy (signals retry)
+    async fn do_get_provider_options(
+        &self,
+    ) -> Result<Option<(Option<HashMap<String, String>>, u64)>> {
+        let provider = self.provider.as_ref().expect("provider must be set");
+
+        // 1.
Check cache with read lock (multiple readers allowed) + { + let cached = self.provider_cache.read().await; + if !self.is_cache_expired(&cached) { + return Ok(Some(match cached.as_ref() { + Some(c) => (Some(c.options.clone()), c.version), + None => (None, 0), + })); + } + } + + // 2. Try to acquire write lock - return None if busy (another thread refreshing) + let Ok(mut cache) = self.provider_cache.try_write() else { + return Ok(None); // Signal caller to retry + }; + + // 3. Double-check: another thread may have refreshed while we waited + if !self.is_cache_expired(&cache) { + return Ok(Some(match cache.as_ref() { + Some(c) => (Some(c.options.clone()), c.version), + None => (None, 0), + })); + } + + // 4. Actually refresh + log::debug!( + "Refreshing storage options from provider: {}", + provider.provider_id() + ); + + let fresh_options = provider.fetch_storage_options().await?; + let expires_at_millis: Option = fresh_options + .as_ref() + .and_then(|opts| opts.get(EXPIRES_AT_MILLIS_KEY)) + .and_then(|v| v.parse().ok()); + + // Get next version number + let version = self + .next_version + .fetch_add(1, std::sync::atomic::Ordering::SeqCst); + + if let Some(expires_at) = expires_at_millis { + let now_ms = current_time_millis(); + let expires_in_secs = expires_at.saturating_sub(now_ms) / 1000; + log::debug!( + "Successfully refreshed storage options from provider: {}, expires in {} seconds (version {})", + provider.provider_id(), + expires_in_secs, + version + ); + } else { + log::debug!( + "Successfully refreshed storage options from provider: {} (no expiration, version {})", + provider.provider_id(), + version + ); + } + + *cache = fresh_options.as_ref().map(|opts| CachedProviderOptions { + options: opts.clone(), + expires_at_millis, + version, + }); + + Ok(Some((fresh_options, version))) + } + + fn is_cache_expired(&self, cached: &Option) -> bool { + match cached { + None => true, + Some(c) => match c.expires_at_millis { + None => false, // No expiration = never expires + Some(expires_at) => { + let now_ms = current_time_millis(); + now_ms + self.refresh_offset.as_millis() as u64 >= expires_at + } + }, + } + } + + /// Check if this accessor has a provider + pub fn has_provider(&self) -> bool { + self.provider.is_some() + } + + /// Get the initial options (without provider options) + pub fn initial_options(&self) -> Option<&HashMap> { + self.initial_options.as_ref() + } + + /// Get the provider, if present + pub fn provider(&self) -> Option> { + self.provider.clone() + } + + /// Create a new accessor with updated initial options, preserving the provider + /// + /// This is useful when you need to merge additional options into an existing + /// accessor while keeping the same provider for dynamic refresh. + /// + /// # Arguments + /// * `new_initial_options` - The new initial options (replaces existing) + /// * `refresh_offset` - Duration before expiry to refresh options + pub fn with_updated_initial_options( + &self, + new_initial_options: HashMap, + refresh_offset: Duration, + ) -> Self { + Self { + initial_options: Some(new_initial_options), + provider: self.provider.clone(), + // Create new cache - we don't copy the old cache since credentials + // should be fresh from the provider + provider_cache: Arc::new(RwLock::new(None)), + refresh_offset, + next_version: Arc::new(std::sync::atomic::AtomicU64::new(1)), + } + } + + /// Create a new accessor by merging additional options with this accessor's initial options + /// and setting a new provider. 
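+    ///
+    /// A minimal sketch (the provider value and option keys are illustrative):
+    ///
+    /// ```ignore
+    /// let swapped = accessor.merge_with_provider(
+    ///     HashMap::from([("region".to_string(), "us-east-1".to_string())]),
+    ///     new_provider, // Arc<dyn StorageOptionsProvider>
+    ///     Duration::from_secs(60),
+    /// );
+    /// ```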
+ /// + /// The additional options take precedence over existing initial options. + /// + /// # Arguments + /// * `additional_options` - Options to merge (these take precedence) + /// * `provider` - The new storage options provider + /// * `refresh_offset` - Duration before expiry to refresh options + pub fn merge_with_provider( + &self, + additional_options: HashMap, + provider: Arc, + refresh_offset: Duration, + ) -> Self { + let mut merged = self.initial_options.clone().unwrap_or_default(); + merged.extend(additional_options); + Self { + initial_options: Some(merged), + provider: Some(provider), + provider_cache: Arc::new(RwLock::new(None)), + refresh_offset, + next_version: Arc::new(std::sync::atomic::AtomicU64::new(1)), + } + } + + // ========================================================================= + // Async helper methods for extracting configuration from merged options + // These use get_options() to get merged/refreshed options from the provider + // ========================================================================= + + /// Apply environment variable overrides to a HashMap of storage options + /// + /// This reads common environment variables and adds them to the options if present: + /// - AZURE_STORAGE_ALLOW_HTTP / AZURE_STORAGE_USE_HTTP -> allow_http + /// - AWS_ALLOW_HTTP -> allow_http + /// - OBJECT_STORE_CLIENT_MAX_RETRIES -> client_max_retries + /// - OBJECT_STORE_CLIENT_RETRY_TIMEOUT -> client_retry_timeout + pub fn apply_env_overrides(options: &mut HashMap) { + if let Ok(value) = std::env::var("AZURE_STORAGE_ALLOW_HTTP") { + options.insert("allow_http".into(), value); + } + if let Ok(value) = std::env::var("AZURE_STORAGE_USE_HTTP") { + options.insert("allow_http".into(), value); + } + if let Ok(value) = std::env::var("AWS_ALLOW_HTTP") { + options.insert("allow_http".into(), value); + } + if let Ok(value) = std::env::var("OBJECT_STORE_CLIENT_MAX_RETRIES") { + options.insert("client_max_retries".into(), value); + } + if let Ok(value) = std::env::var("OBJECT_STORE_CLIENT_RETRY_TIMEOUT") { + options.insert("client_retry_timeout".into(), value); + } + } + + /// Get a value from merged options by key (async, refreshes if needed) + pub async fn get(&self, key: &str) -> Result> { + let opts = self.get_options().await?; + Ok(opts.and_then(|o| o.get(key).cloned())) + } + + /// Get a value from initial options by key (synchronous, no provider refresh) + pub fn get_initial(&self, key: &str) -> Option<&String> { + self.initial_options.as_ref().and_then(|opts| opts.get(key)) + } + + /// Denotes if unsecure connections via http are allowed (async, uses merged options) + pub async fn allow_http(&self) -> Result { + let opts = self.get_options().await?; + Ok(opts + .map(|o| { + o.iter().any(|(key, value)| { + key.to_ascii_lowercase().contains("allow_http") && str_is_truthy(value) + }) + }) + .unwrap_or(false)) + } + + /// Number of times to retry a download that fails (async, uses merged options) + pub async fn download_retry_count(&self) -> Result { + let opts = self.get_options().await?; + Ok(opts + .and_then(|o| { + o.iter() + .find(|(key, _)| key.eq_ignore_ascii_case("download_retry_count")) + .map(|(_, value)| value.parse::().unwrap_or(3)) + }) + .unwrap_or(3)) + } + + /// Max retry times to set in RetryConfig for object store client (async, uses merged options) + pub async fn client_max_retries(&self) -> Result { + let opts = self.get_options().await?; + Ok(opts + .and_then(|o| { + o.iter() + .find(|(key, _)| key.eq_ignore_ascii_case("client_max_retries")) + 
.and_then(|(_, value)| value.parse::().ok()) + }) + .unwrap_or(10)) + } + + /// Seconds of timeout to set in RetryConfig for object store client (async, uses merged options) + pub async fn client_retry_timeout(&self) -> Result { + let opts = self.get_options().await?; + Ok(opts + .and_then(|o| { + o.iter() + .find(|(key, _)| key.eq_ignore_ascii_case("client_retry_timeout")) + .and_then(|(_, value)| value.parse::().ok()) + }) + .unwrap_or(180)) + } + + /// Get the expiration time in milliseconds since epoch, if present (async, uses merged options) + pub async fn expires_at_millis(&self) -> Result> { + let opts = self.get_options().await?; + Ok(opts.and_then(|o| { + o.get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::().ok()) + })) + } + + /// Check if use_opendal flag is set (async, uses merged options) + pub async fn use_opendal(&self) -> Result { + let opts = self.get_options().await?; + Ok(opts + .and_then(|o| o.get("use_opendal").map(|v| v.as_str() == "true")) + .unwrap_or(false)) + } + + /// Get DynamoDB endpoint for S3 commit lock (async, uses merged options, falls back to env var) + #[cfg(feature = "aws")] + pub async fn dynamodb_endpoint(&self) -> Result> { + let opts = self.get_options().await?; + let from_opts = opts.and_then(|o| { + o.iter() + .find(|(key, _)| key.eq_ignore_ascii_case("dynamodb_endpoint")) + .map(|(_, value)| value.clone()) + }); + // Fall back to environment variable if not in options + Ok(from_opts.or_else(|| std::env::var("DYNAMODB_ENDPOINT").ok())) + } + + /// Get Hugging Face token (async, uses merged options, checks env vars as fallback) + pub async fn hf_token(&self) -> Result> { + let opts = self.get_options().await?; + Ok(opts + .and_then(|o| o.get("hf_token").cloned()) + .or_else(|| std::env::var("HF_TOKEN").ok()) + .or_else(|| std::env::var("HUGGINGFACE_TOKEN").ok())) + } + + /// Get Hugging Face revision (async, uses merged options) + pub async fn hf_revision(&self) -> Result> { + let opts = self.get_options().await?; + Ok(opts.and_then(|o| o.get("hf_revision").cloned())) + } + + /// Get Hugging Face root path (async, uses merged options) + pub async fn hf_root(&self) -> Result> { + let opts = self.get_options().await?; + Ok(opts.and_then(|o| o.get("hf_root").cloned())) + } + + // ========================================================================= + // AWS S3 provider-specific methods + // ========================================================================= + + /// Add values from the environment to storage options for S3 + #[cfg(feature = "aws")] + pub fn apply_env_s3(options: &mut HashMap) { + for (os_key, os_value) in std::env::vars_os() { + if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { + if let Ok(config_key) = AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()) { + if !options.contains_key(config_key.as_ref()) { + options.insert(config_key.as_ref().to_string(), value.to_string()); + } + } + } + } + } + + /// Convert storage options to S3-specific options map (static helper) + #[cfg(feature = "aws")] + pub fn as_s3_options(options: &HashMap) -> HashMap { + options + .iter() + .filter_map(|(key, value)| { + let s3_key = AmazonS3ConfigKey::from_str(&key.to_ascii_lowercase()).ok()?; + Some((s3_key, value.clone())) + }) + .collect() + } + + /// Get merged storage options as S3-specific options map (async, refreshes if needed) + #[cfg(feature = "aws")] + pub async fn get_s3_options(&self) -> Result> { + let opts = self.get_options().await?.unwrap_or_default(); + Ok(Self::as_s3_options(&opts)) + } + + /// Get 
merged storage options with S3 env vars applied (async, refreshes if needed) + /// + /// This method: + /// 1. Gets merged options from initial + provider + /// 2. Applies common env overrides + /// 3. Applies S3-specific env vars + #[cfg(feature = "aws")] + pub async fn get_options_with_s3_env(&self) -> Result> { + let mut opts = self.get_options().await?.unwrap_or_default(); + Self::apply_env_overrides(&mut opts); + Self::apply_env_s3(&mut opts); + Ok(opts) + } + + /// Get merged storage options as S3-specific options map with env vars applied + #[cfg(feature = "aws")] + pub async fn get_s3_options_with_env(&self) -> Result> { + let opts = self.get_options_with_s3_env().await?; + Ok(Self::as_s3_options(&opts)) + } + + /// Check if use_opendal flag is set for S3 + #[cfg(feature = "aws")] + pub async fn s3_use_opendal(&self) -> Result { + let opts = self.get_options_with_s3_env().await?; + Ok(opts + .get("use_opendal") + .map(|v| v == "true") + .unwrap_or(false)) + } + + /// Check if S3 Express mode is enabled + #[cfg(feature = "aws")] + pub async fn s3_express(&self) -> Result { + let opts = self.get_options_with_s3_env().await?; + Ok(opts.get("s3_express").map(|v| v == "true").unwrap_or(false)) + } + + /// Check if this is Cloudflare R2 (requires constant size upload parts) + #[cfg(feature = "aws")] + pub async fn is_cloudflare_r2(&self) -> Result { + let opts = self.get_options_with_s3_env().await?; + Ok(opts + .get("aws_endpoint") + .map(|endpoint| endpoint.contains("r2.cloudflarestorage.com")) + .unwrap_or(false)) + } + + // ========================================================================= + // Azure provider-specific methods + // ========================================================================= + + /// Create storage options from environment variables for Azure + #[cfg(feature = "azure")] + pub fn from_env_azure() -> HashMap { + let mut opts = HashMap::::new(); + for (os_key, os_value) in std::env::vars_os() { + if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { + if let Ok(config_key) = AzureConfigKey::from_str(&key.to_ascii_lowercase()) { + opts.insert(config_key.as_ref().to_string(), value.to_string()); + } + } + } + opts + } + + /// Add values from the environment to storage options for Azure + #[cfg(feature = "azure")] + pub fn apply_env_azure( + options: &mut HashMap, + env_options: &HashMap, + ) { + for (os_key, os_value) in env_options { + if !options.contains_key(os_key) { + options.insert(os_key.clone(), os_value.clone()); + } + } + } + + /// Convert storage options to Azure-specific options map (static helper) + #[cfg(feature = "azure")] + pub fn as_azure_options(options: &HashMap) -> HashMap { + options + .iter() + .filter_map(|(key, value)| { + let az_key = AzureConfigKey::from_str(&key.to_ascii_lowercase()).ok()?; + Some((az_key, value.clone())) + }) + .collect() + } + + /// Get merged storage options as Azure-specific options map (async, refreshes if needed) + #[cfg(feature = "azure")] + pub async fn get_azure_options(&self) -> Result> { + let opts = self.get_options().await?.unwrap_or_default(); + Ok(Self::as_azure_options(&opts)) + } + + /// Get merged storage options with Azure env vars applied (async, refreshes if needed) + #[cfg(feature = "azure")] + pub async fn get_options_with_azure_env(&self) -> Result> { + let env_options = Self::from_env_azure(); + let mut opts = self.get_options().await?.unwrap_or_default(); + Self::apply_env_overrides(&mut opts); + Self::apply_env_azure(&mut opts, &env_options); + Ok(opts) + } + + /// 
Get merged storage options as Azure-specific options map with env vars applied + #[cfg(feature = "azure")] + pub async fn get_azure_options_with_env(&self) -> Result> { + let opts = self.get_options_with_azure_env().await?; + Ok(Self::as_azure_options(&opts)) + } + + /// Check if use_opendal flag is set for Azure + #[cfg(feature = "azure")] + pub async fn azure_use_opendal(&self) -> Result { + let opts = self.get_options_with_azure_env().await?; + Ok(opts + .get("use_opendal") + .map(|v| v == "true") + .unwrap_or(false)) + } + + /// Find the configured Azure storage account name from options (static helper) + #[cfg(feature = "azure")] + #[allow(clippy::manual_map)] + pub fn find_configured_storage_account(options: &HashMap) -> Option { + if let Some(account) = options.get("azure_storage_account_name") { + Some(account.clone()) + } else if let Some(account) = options.get("account_name") { + Some(account.clone()) + } else { + None + } + } + + /// Get the configured Azure storage account name (async, uses merged options with env) + #[cfg(feature = "azure")] + pub async fn get_azure_storage_account(&self) -> Result> { + let opts = self.get_options_with_azure_env().await?; + Ok(Self::find_configured_storage_account(&opts)) + } + + // ========================================================================= + // GCS provider-specific methods + // ========================================================================= + + /// Add values from the environment to storage options for GCS + #[cfg(feature = "gcp")] + pub fn apply_env_gcs(options: &mut HashMap) { + for (os_key, os_value) in std::env::vars_os() { + if let (Some(key), Some(value)) = (os_key.to_str(), os_value.to_str()) { + let lowercase_key = key.to_ascii_lowercase(); + let token_key = "google_storage_token"; + + if let Ok(config_key) = GoogleConfigKey::from_str(&lowercase_key) { + if !options.contains_key(config_key.as_ref()) { + options.insert(config_key.as_ref().to_string(), value.to_string()); + } + } + // Check for GOOGLE_STORAGE_TOKEN until GoogleConfigKey supports storage token + else if lowercase_key == token_key && !options.contains_key(token_key) { + options.insert(token_key.to_string(), value.to_string()); + } + } + } + } + + /// Convert storage options to GCS-specific options map (static helper) + #[cfg(feature = "gcp")] + pub fn as_gcs_options(options: &HashMap) -> HashMap { + options + .iter() + .filter_map(|(key, value)| { + let gcs_key = GoogleConfigKey::from_str(&key.to_ascii_lowercase()).ok()?; + Some((gcs_key, value.clone())) + }) + .collect() + } + + /// Get merged storage options as GCS-specific options map (async, refreshes if needed) + #[cfg(feature = "gcp")] + pub async fn get_gcs_options(&self) -> Result> { + let opts = self.get_options().await?.unwrap_or_default(); + Ok(Self::as_gcs_options(&opts)) + } + + /// Get merged storage options with GCS env vars applied (async, refreshes if needed) + #[cfg(feature = "gcp")] + pub async fn get_options_with_gcs_env(&self) -> Result> { + let mut opts = self.get_options().await?.unwrap_or_default(); + Self::apply_env_overrides(&mut opts); + Self::apply_env_gcs(&mut opts); + Ok(opts) + } + + /// Get merged storage options as GCS-specific options map with env vars applied + #[cfg(feature = "gcp")] + pub async fn get_gcs_options_with_env(&self) -> Result> { + let opts = self.get_options_with_gcs_env().await?; + Ok(Self::as_gcs_options(&opts)) + } + + /// Check if use_opendal flag is set for GCS + #[cfg(feature = "gcp")] + pub async fn gcs_use_opendal(&self) -> Result { + let 
opts = self.get_options_with_gcs_env().await?; + Ok(opts + .get("use_opendal") + .map(|v| v == "true") + .unwrap_or(false)) + } + + /// Get Google storage token (async, uses merged options with env) + #[cfg(feature = "gcp")] + pub async fn google_storage_token(&self) -> Result> { + let opts = self.get_options_with_gcs_env().await?; + Ok(opts.get("google_storage_token").cloned()) + } + + // ========================================================================= + // OSS provider-specific methods + // ========================================================================= + + /// Get OSS endpoint (async, uses merged options) + pub async fn oss_endpoint(&self) -> Result> { + let opts = self.get_options().await?.unwrap_or_default(); + Ok(opts.get("oss_endpoint").cloned()) + } + + /// Get OSS access key ID (async, uses merged options) + pub async fn oss_access_key_id(&self) -> Result> { + let opts = self.get_options().await?.unwrap_or_default(); + Ok(opts.get("oss_access_key_id").cloned()) + } + + /// Get OSS secret access key (async, uses merged options) + pub async fn oss_secret_access_key(&self) -> Result> { + let opts = self.get_options().await?.unwrap_or_default(); + Ok(opts.get("oss_secret_access_key").cloned()) + } + + /// Get OSS region (async, uses merged options) + pub async fn oss_region(&self) -> Result> { + let opts = self.get_options().await?.unwrap_or_default(); + Ok(opts.get("oss_region").cloned()) + } +} + +/// Check if a string value is truthy (true, 1, yes, on) +fn str_is_truthy(val: &str) -> bool { + let lower = val.to_ascii_lowercase(); + lower == "1" || lower == "true" || lower == "on" || lower == "yes" +} + +#[cfg(test)] +mod tests { + use super::*; + use mock_instant::thread_local::MockClock; + + #[derive(Debug)] + struct MockStorageOptionsProvider { + call_count: Arc>, + expires_in_millis: Option, + } + + impl MockStorageOptionsProvider { + fn new(expires_in_millis: Option) -> Self { + Self { + call_count: Arc::new(RwLock::new(0)), + expires_in_millis, + } + } + + async fn get_call_count(&self) -> usize { + *self.call_count.read().await + } + } + + #[async_trait] + impl StorageOptionsProvider for MockStorageOptionsProvider { + async fn fetch_storage_options(&self) -> Result>> { + let count = { + let mut c = self.call_count.write().await; + *c += 1; + *c + }; + + let mut options = HashMap::from([ + ("key".to_string(), format!("value_{}", count)), + ("count".to_string(), count.to_string()), + ]); + + if let Some(expires_in) = self.expires_in_millis { + let now_ms = MockClock::system_time().as_millis() as u64; + let expires_at = now_ms + expires_in; + options.insert(EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()); + } + + Ok(Some(options)) + } + + fn provider_id(&self) -> String { + let ptr = Arc::as_ptr(&self.call_count) as usize; + format!("MockStorageOptionsProvider {{ id: {} }}", ptr) + } + } + + #[tokio::test] + async fn test_accessor_initial_fetch() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + // Create a mock provider that returns options expiring in 10 minutes + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + // Create accessor with only a provider (no initial options) + let accessor = StorageOptionsAccessor::new_with_provider( + mock_provider.clone(), + Duration::from_secs(300), + ); + + // First call should fetch from provider + let opts = accessor.get_options().await.unwrap().unwrap(); + assert_eq!(opts.get("key"), Some(&"value_1".to_string())); + assert_eq!(opts.get("count"), 
Some(&"1".to_string())); + assert_eq!(mock_provider.get_call_count().await, 1); + + // Second call should use cached options (not expired yet) + let opts = accessor.get_options().await.unwrap().unwrap(); + assert_eq!(opts.get("key"), Some(&"value_1".to_string())); + assert_eq!(mock_provider.get_call_count().await, 1); // Still 1, didn't fetch again + } + + #[tokio::test] + async fn test_accessor_with_initial_provider_cache() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let now_ms = MockClock::system_time().as_millis() as u64; + + // Create a mock provider + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + // Create initial provider options that expire in 10 minutes + let expires_at = now_ms + 600_000; + let initial_provider_options = HashMap::from([ + ("key".to_string(), "initial_value".to_string()), + (EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()), + ]); + + // Create accessor with initial provider cache + let accessor = StorageOptionsAccessor::new_with_initial_provider_cache( + None, + mock_provider.clone(), + Duration::from_secs(300), + initial_provider_options, + ); + + // First call should use initial cached options + let opts = accessor.get_options().await.unwrap().unwrap(); + assert_eq!(opts.get("key"), Some(&"initial_value".to_string())); + + // Should not have called the provider yet + assert_eq!(mock_provider.get_call_count().await, 0); + } + + #[tokio::test] + async fn test_accessor_refresh_on_expiration() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + // Create a mock provider that returns options expiring in 10 minutes + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + // Create accessor with 5 minute refresh offset + let accessor = StorageOptionsAccessor::new_with_provider( + mock_provider.clone(), + Duration::from_secs(300), + ); + + // First call should fetch from provider + let opts = accessor.get_options().await.unwrap().unwrap(); + assert_eq!(opts.get("count"), Some(&"1".to_string())); + assert_eq!(mock_provider.get_call_count().await, 1); + + // Advance time to 6 minutes - should trigger refresh (within 5 min refresh offset) + MockClock::set_system_time(Duration::from_secs(100_000 + 360)); + let opts = accessor.get_options().await.unwrap().unwrap(); + assert_eq!(opts.get("count"), Some(&"2".to_string())); + assert_eq!(mock_provider.get_call_count().await, 2); + + // Advance time to 11 minutes total - should trigger another refresh + MockClock::set_system_time(Duration::from_secs(100_000 + 660)); + let opts = accessor.get_options().await.unwrap().unwrap(); + assert_eq!(opts.get("count"), Some(&"3".to_string())); + assert_eq!(mock_provider.get_call_count().await, 3); + } + + #[tokio::test] + async fn test_accessor_refresh_lead_time() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + // Create a mock provider that returns options expiring in 4 minutes + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(240_000))); + + // Create accessor with 5 minute refresh offset + // This means options should be refreshed when they have less than 5 minutes left + let accessor = StorageOptionsAccessor::new_with_provider( + mock_provider.clone(), + Duration::from_secs(300), + ); + + // First call should fetch (no initial cache) + // Options expire in 4 minutes, which is less than our 5 minute refresh offset, + // so they should be considered "needs refresh" immediately + let opts = accessor.get_options().await.unwrap().unwrap(); + 
assert_eq!(opts.get("count"), Some(&"1".to_string())); + assert_eq!(mock_provider.get_call_count().await, 1); + + // Second call should trigger refresh because options expire in 4 minutes + // but our refresh lead time is 5 minutes (now + 5min > expires_at) + let opts = accessor.get_options().await.unwrap().unwrap(); + assert_eq!(opts.get("count"), Some(&"2".to_string())); + assert_eq!(mock_provider.get_call_count().await, 2); + } + + #[tokio::test] + async fn test_accessor_no_expiration() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + // Create a mock provider that returns options WITHOUT expiration + let mock_provider = Arc::new(MockStorageOptionsProvider::new(None)); + + // Create accessor + let accessor = StorageOptionsAccessor::new_with_provider( + mock_provider.clone(), + Duration::from_secs(300), + ); + + // First call should fetch + let opts = accessor.get_options().await.unwrap().unwrap(); + assert_eq!(opts.get("count"), Some(&"1".to_string())); + assert_eq!(mock_provider.get_call_count().await, 1); + + // Advance time significantly - should NOT trigger refresh (no expiration) + MockClock::set_system_time(Duration::from_secs(100_000 + 3600)); // 1 hour later + let opts = accessor.get_options().await.unwrap().unwrap(); + assert_eq!(opts.get("count"), Some(&"1".to_string())); + assert_eq!(mock_provider.get_call_count().await, 1); // Still 1, didn't refresh + } + + #[tokio::test] + async fn test_accessor_concurrent_access() { + // Create a mock provider with far future expiration + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(9999999999999))); + + let accessor = Arc::new(StorageOptionsAccessor::new_with_provider( + mock_provider.clone(), + Duration::from_secs(300), + )); + + // Spawn 10 concurrent tasks that all try to get options at the same time + let mut handles = vec![]; + for i in 0..10 { + let accessor = accessor.clone(); + let handle = tokio::spawn(async move { + let opts = accessor.get_options().await.unwrap().unwrap(); + // Verify we got the correct options (should all be count=1 from first fetch) + assert_eq!(opts.get("count"), Some(&"1".to_string())); + i // Return task number for verification + }); + handles.push(handle); + } + + // Wait for all tasks to complete + let results: Vec<_> = futures::future::join_all(handles) + .await + .into_iter() + .map(|r| r.unwrap()) + .collect(); + + // Verify all 10 tasks completed successfully + assert_eq!(results.len(), 10); + for i in 0..10 { + assert!(results.contains(&i)); + } + + // The provider should have been called exactly once (first request triggers fetch, + // subsequent requests use cache) + let call_count = mock_provider.get_call_count().await; + assert_eq!( + call_count, 1, + "Provider should be called exactly once despite concurrent access" + ); + } + + #[tokio::test] + async fn test_accessor_concurrent_refresh() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let now_ms = MockClock::system_time().as_millis() as u64; + + // Create initial options that expired in the past + let expires_at = now_ms - 1_000_000; + let initial_provider_options = HashMap::from([ + ("key".to_string(), "old_value".to_string()), + (EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()), + ]); + + // Mock will return options expiring in 1 hour + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(3_600_000))); + + let accessor = Arc::new(StorageOptionsAccessor::new_with_initial_provider_cache( + None, + mock_provider.clone(), + Duration::from_secs(300), + 
initial_provider_options, + )); + + // Spawn 20 concurrent tasks that all try to get options at the same time + // Since the initial options are expired, they'll all try to refresh + let mut handles = vec![]; + for i in 0..20 { + let accessor = accessor.clone(); + let handle = tokio::spawn(async move { + let opts = accessor.get_options().await.unwrap().unwrap(); + // All should get the new options (count=1 from first fetch) + assert_eq!(opts.get("count"), Some(&"1".to_string())); + i + }); + handles.push(handle); + } + + // Wait for all tasks to complete + let results: Vec<_> = futures::future::join_all(handles) + .await + .into_iter() + .map(|r| r.unwrap()) + .collect(); + + // Verify all 20 tasks completed successfully + assert_eq!(results.len(), 20); + + // The provider should have been called at least once, but possibly more times + // due to the try_write mechanism and race conditions + let call_count = mock_provider.get_call_count().await; + assert!( + call_count >= 1, + "Provider should be called at least once, was called {} times", + call_count + ); + + // It shouldn't be called 20 times though - the lock should prevent most concurrent fetches + assert!( + call_count < 10, + "Provider should not be called too many times due to lock contention, was called {} times", + call_count + ); + } + + #[tokio::test] + async fn test_accessor_static_options_only() { + // Create accessor with only static options (no provider) + let static_options = HashMap::from([ + ("key1".to_string(), "value1".to_string()), + ("key2".to_string(), "value2".to_string()), + ]); + + let accessor = StorageOptionsAccessor::new_with_options(static_options); + + // Should return static options immediately + let opts = accessor.get_options().await.unwrap().unwrap(); + assert_eq!(opts.get("key1"), Some(&"value1".to_string())); + assert_eq!(opts.get("key2"), Some(&"value2".to_string())); + + // Version should be 0 (no provider) + let (opts, version) = accessor.get_options_with_version().await.unwrap(); + assert_eq!(version, 0); + assert!(opts.is_some()); + } + + #[tokio::test] + async fn test_accessor_merges_initial_and_provider_options() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + // Create static options + let initial_options = HashMap::from([ + ("static_key".to_string(), "static_value".to_string()), + ("shared_key".to_string(), "initial_value".to_string()), + ]); + + // Provider returns different options + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + // Create accessor with both + let accessor = StorageOptionsAccessor::new_with_options_and_provider( + initial_options, + mock_provider, + Duration::from_secs(300), + ); + + // Should merge: initial + provider (provider takes precedence for shared keys) + let opts = accessor.get_options().await.unwrap().unwrap(); + + // Static key should be present + assert_eq!(opts.get("static_key"), Some(&"static_value".to_string())); + + // Provider keys should be present + assert_eq!(opts.get("key"), Some(&"value_1".to_string())); + assert_eq!(opts.get("count"), Some(&"1".to_string())); + } +} diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index fdb4370f6ab..6f4af1c5602 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -14,7 +14,9 @@ use async_trait::async_trait; use bytes::Bytes; use lance::dataset::{Dataset, WriteParams}; use lance::session::Session; -use lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry}; 
+use lance_io::object_store::{ + ObjectStore, ObjectStoreParams, ObjectStoreRegistry, StorageOptionsAccessor, +}; use object_store::path::Path; use std::collections::HashMap; use std::io::Cursor; @@ -319,7 +321,9 @@ impl DirectoryNamespaceBuilder { ) -> Result<(Arc, Path)> { // Build ObjectStoreParams from storage options let params = ObjectStoreParams { - storage_options: storage_options.clone(), + storage_options_accessor: storage_options.as_ref().map(|opts| { + std::sync::Arc::new(StorageOptionsAccessor::new_with_options(opts.clone())) + }), ..Default::default() }; @@ -932,7 +936,9 @@ impl LanceNamespace for DirectoryNamespace { }; let store_params = self.storage_options.as_ref().map(|opts| ObjectStoreParams { - storage_options: Some(opts.clone()), + storage_options_accessor: Some(std::sync::Arc::new( + StorageOptionsAccessor::new_with_options(opts.clone()), + )), ..Default::default() }); diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index 4791bbb9df5..0bf263e0021 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -21,7 +21,7 @@ use lance_index::optimize::OptimizeOptions; use lance_index::scalar::{BuiltinIndexType, ScalarIndexParams}; use lance_index::traits::DatasetIndexExt; use lance_index::IndexType; -use lance_io::object_store::{ObjectStore, ObjectStoreParams}; +use lance_io::object_store::{ObjectStore, ObjectStoreParams, StorageOptionsAccessor}; use lance_namespace::models::{ CreateEmptyTableRequest, CreateEmptyTableResponse, CreateNamespaceRequest, CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, DeregisterTableRequest, @@ -981,7 +981,9 @@ impl ManifestNamespace { let write_params = WriteParams { session, store_params: storage_options.as_ref().map(|opts| ObjectStoreParams { - storage_options: Some(opts.clone()), + storage_options_accessor: Some(std::sync::Arc::new( + StorageOptionsAccessor::new_with_options(opts.clone()), + )), ..Default::default() }), ..Default::default() diff --git a/rust/lance-table/src/io/commit.rs b/rust/lance-table/src/io/commit.rs index be650beac50..ab8b1de035a 100644 --- a/rust/lance-table/src/io/commit.rs +++ b/rust/lance-table/src/io/commit.rs @@ -60,7 +60,7 @@ use { self::external_manifest::{ExternalManifestCommitHandler, ExternalManifestStore}, aws_credential_types::provider::error::CredentialsError, aws_credential_types::provider::ProvideCredentials, - lance_io::object_store::{providers::aws::build_aws_credential, StorageOptions}, + lance_io::object_store::{providers::aws::build_aws_credential, StorageOptionsAccessor}, object_store::aws::AmazonS3ConfigKey, object_store::aws::AwsCredentialProvider, std::borrow::Cow, @@ -765,28 +765,28 @@ pub async fn commit_handler_from_url( } }; let options = options.clone().unwrap_or_default(); - let storage_options = StorageOptions(options.storage_options.unwrap_or_default()); - let dynamo_endpoint = get_dynamodb_endpoint(&storage_options); - let expires_at_millis = storage_options.expires_at_millis(); - let storage_options = storage_options.as_s3_options(); + let accessor = options.accessor(); + let dynamo_endpoint = accessor.dynamodb_endpoint().await?; + let s3_storage_options = accessor.get_s3_options_with_env().await?; - let region = storage_options.get(&AmazonS3ConfigKey::Region).cloned(); + let region = s3_storage_options.get(&AmazonS3ConfigKey::Region).cloned(); + let storage_options_provider = accessor.provider(); - let (aws_creds, region) = build_aws_credential( + 
let cred_result = build_aws_credential( options.s3_credentials_refresh_offset, options.aws_credentials.clone(), - Some(&storage_options), + Some(&s3_storage_options), region, - options.storage_options_provider.clone(), - expires_at_millis, + storage_options_provider, + None, // expires_at_millis ) .await?; Ok(Arc::new(ExternalManifestCommitHandler { external_manifest_store: build_dynamodb_external_store( table_name, - aws_creds.clone(), - ®ion, + cred_result.credential_provider.clone(), + &cred_result.region, dynamo_endpoint, "lancedb", ) @@ -797,15 +797,6 @@ pub async fn commit_handler_from_url( } } -#[cfg(feature = "dynamodb")] -fn get_dynamodb_endpoint(storage_options: &StorageOptions) -> Option { - if let Some(endpoint) = storage_options.0.get("dynamodb_endpoint") { - Some(endpoint.clone()) - } else { - std::env::var("DYNAMODB_ENDPOINT").ok() - } -} - /// Errors that can occur when committing a manifest. #[derive(Debug)] pub enum CommitError { diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 26d4b16faca..d2c01a13a0c 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -171,6 +171,7 @@ pub struct Dataset { /// Object store parameters used when opening this dataset. /// These are used when creating object stores for additional base paths. + /// Also contains the storage_options_accessor for efficient repeated access. pub(crate) store_params: Option>, } @@ -807,12 +808,26 @@ impl Dataset { /// For CREATE mode, calls create_empty_table() to initialize the table. /// For other modes, calls describe_table() and opens dataset with namespace credentials. /// + /// # Storage Options Handling + /// + /// Users only need to set basic additional storage options (e.g., region, endpoint overrides) + /// in `params.store_params`. This function will automatically: + /// + /// - Fetch storage options from the namespace (credentials, bucket info, etc.) + /// - Merge user-provided options with namespace options (namespace options take precedence) + /// - Set up a [`StorageOptionsAccessor`] with a provider for automatic credential refresh + /// + /// The namespace-provided options are cached and automatically refreshed before expiration + /// based on the `expires_at_millis` key returned by the namespace. + /// /// # Arguments /// /// * `batches` - The record batches to write /// * `namespace` - The namespace to use for table management /// * `table_id` - The table identifier - /// * `params` - Write parameters + /// * `params` - Write parameters. Only basic storage options need to be set in + /// `store_params`; namespace-related configs (credentials, storage options provider) + /// are configured automatically by this function. /// * `ignore_namespace_table_storage_options` - If true, ignore storage options returned /// by the namespace and only use the storage options in params. The storage options /// provider will not be created, so credentials will not be automatically refreshed. 
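    ///
    /// # Example (editor's sketch)
    ///
    /// A minimal illustration of the merge behavior described above. This assumes
    /// the documented function is the namespace-aware write entry point (its name
    /// is outside this hunk) and that `batches` and `namespace` are already in
    /// scope; only the basic option shown is user-supplied:
    ///
    /// ```ignore
    /// let mut params = WriteParams::default();
    /// params.store_params = Some(ObjectStoreParams {
    ///     storage_options_accessor: Some(Arc::new(
    ///         StorageOptionsAccessor::new_with_options(HashMap::from([(
    ///             "region".to_string(),
    ///             "us-east-1".to_string(),
    ///         )])),
    ///     )),
    ///     ..Default::default()
    /// });
    /// // Namespace credentials and a refresh provider are merged in automatically.
    /// ```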
@@ -855,18 +870,16 @@ impl Dataset { namespace, table_id, )); - // Merge namespace storage options with any existing options - let mut merged_options = write_params - .store_params - .as_ref() - .and_then(|p| p.storage_options.clone()) - .unwrap_or_default(); - merged_options.extend(namespace_storage_options); - + // Merge namespace storage options with existing options using accessor let existing_params = write_params.store_params.take().unwrap_or_default(); + let existing_accessor = existing_params.accessor(); + let merged_accessor = Arc::new(existing_accessor.merge_with_provider( + namespace_storage_options, + provider, + existing_params.s3_credentials_refresh_offset, + )); write_params.store_params = Some(ObjectStoreParams { - storage_options: Some(merged_options), - storage_options_provider: Some(provider), + storage_options_accessor: Some(merged_accessor), ..existing_params }); } @@ -904,18 +917,16 @@ impl Dataset { table_id.clone(), )); - // Merge namespace storage options with any existing options - let mut merged_options = write_params - .store_params - .as_ref() - .and_then(|p| p.storage_options.clone()) - .unwrap_or_default(); - merged_options.extend(namespace_storage_options); - + // Merge namespace storage options with existing options using accessor let existing_params = write_params.store_params.take().unwrap_or_default(); + let existing_accessor = existing_params.accessor(); + let merged_accessor = Arc::new(existing_accessor.merge_with_provider( + namespace_storage_options, + provider, + existing_params.s3_credentials_refresh_offset, + )); write_params.store_params = Some(ObjectStoreParams { - storage_options: Some(merged_options), - storage_options_provider: Some(provider), + storage_options_accessor: Some(merged_accessor), ..existing_params }); } @@ -926,11 +937,18 @@ impl Dataset { // assumes no dataset exists and converts the mode to CREATE. let mut builder = DatasetBuilder::from_uri(uri.as_str()); if let Some(ref store_params) = write_params.store_params { - if let Some(ref storage_options) = store_params.storage_options { - builder = builder.with_storage_options(storage_options.clone()); - } - if let Some(ref provider) = store_params.storage_options_provider { - builder = builder.with_storage_options_provider(provider.clone()); + // Copy storage_options_accessor from store_params if present + // The accessor already contains both the initial options and provider + if let Some(ref accessor) = store_params.storage_options_accessor { + // Get initial storage options from the accessor + let initial_options = accessor.initial_options(); + if let Some(opts) = initial_options { + builder = builder.with_storage_options(opts.clone()); + } + // If the accessor has a provider, set that too + if let Some(provider) = accessor.provider() { + builder = builder.with_storage_options_provider(provider); + } } } let dataset = Arc::new(builder.load().await?); @@ -1592,19 +1610,38 @@ impl Dataset { } /// Returns the storage options used when opening this dataset, if any. + /// Returns the initial storage options (without provider options). pub fn storage_options(&self) -> Option<&HashMap> { self.store_params .as_ref() - .and_then(|params| params.storage_options.as_ref()) + .and_then(|p| p.storage_options_accessor.as_ref()) + .and_then(|a| a.initial_options()) } - /// Returns the storage options provider used when opening this dataset, if any. - pub fn storage_options_provider( + /// Returns the storage options accessor used when opening this dataset, if any. 
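+    ///
+    /// Illustrative use (editor's sketch, assuming the dataset was opened with an
+    /// accessor configured):
+    ///
+    /// ```ignore
+    /// if let Some(accessor) = dataset.storage_options_accessor() {
+    ///     // Merged view: initial options plus any cached provider overrides.
+    ///     let opts = accessor.get_options().await?;
+    /// }
+    /// ```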
+ pub fn storage_options_accessor( &self, - ) -> Option> { + ) -> Option> { self.store_params .as_ref() - .and_then(|params| params.storage_options_provider.clone()) + .and_then(|p| p.storage_options_accessor.clone()) + } + + /// Returns the current storage options, combining the initial storage_options + /// with any overrides from the storage_options_provider (cached until expiration). + /// + /// Provider options take precedence over initial options when keys conflict. + /// The options from the provider are cached and automatically refreshed when + /// they expire (based on the `expires_at_millis` key). + /// + /// If neither storage_options nor storage_options_provider were specified when + /// opening the dataset, an empty HashMap is returned. + pub async fn current_storage_options(&self) -> Result> { + if let Some(accessor) = self.storage_options_accessor() { + Ok(accessor.get_options().await?.unwrap_or_default()) + } else { + Ok(HashMap::new()) + } } pub fn data_dir(&self) -> Path { diff --git a/rust/lance/src/dataset/builder.rs b/rust/lance/src/dataset/builder.rs index 332ba504cf9..1d47f1db56e 100644 --- a/rust/lance/src/dataset/builder.rs +++ b/rust/lance/src/dataset/builder.rs @@ -11,7 +11,7 @@ use lance_core::utils::tracing::{DATASET_LOADING_EVENT, TRACE_DATASET_EVENTS}; use lance_file::datatypes::populate_schema_dictionary; use lance_file::reader::FileReaderOptions; use lance_io::object_store::{ - LanceNamespaceStorageOptionsProvider, ObjectStore, ObjectStoreParams, StorageOptions, + LanceNamespaceStorageOptionsProvider, ObjectStore, ObjectStoreParams, StorageOptionsAccessor, DEFAULT_CLOUD_IO_PARALLELISM, }; use lance_namespace::models::DescribeTableRequest; @@ -46,6 +46,10 @@ pub struct DatasetBuilder { file_reader_options: Option, /// Storage options that override user-provided options (e.g., from namespace) storage_options_override: Option>, + /// User-provided storage options (accumulated via with_storage_options/with_storage_option) + storage_options: HashMap, + /// User-provided storage options provider + storage_options_provider: Option>, } impl std::fmt::Debug for DatasetBuilder { @@ -63,11 +67,46 @@ impl std::fmt::Debug for DatasetBuilder { "storage_options_override", &self.storage_options_override.is_some(), ) + .field("storage_options", &!self.storage_options.is_empty()) + .field( + "storage_options_provider", + &self.storage_options_provider.is_some(), + ) .finish() } } impl DatasetBuilder { + /// Create a DatasetBuilder from a URI. + /// + /// This is the basic constructor for opening a dataset from a URI. 
+ /// + /// # Example: Basic usage + /// ```ignore + /// let dataset = DatasetBuilder::from_uri("s3://bucket/table.lance") + /// .load() + /// .await?; + /// ``` + /// + /// # Example: With custom storage options provider + /// + /// If you need to use a specific URI with namespace-based credential refresh, + /// use [`with_storage_options_provider`](Self::with_storage_options_provider): + /// + /// ```ignore + /// use std::sync::Arc; + /// use lance_io::object_store::LanceNamespaceStorageOptionsProvider; + /// + /// let provider = Arc::new(LanceNamespaceStorageOptionsProvider::new( + /// namespace, + /// vec!["my_table".to_string()], + /// )); + /// + /// let dataset = DatasetBuilder::from_uri("s3://bucket/table.lance") + /// .with_storage_options_provider(provider) + /// .load() + /// .await?; + /// ``` pub fn from_uri>(table_uri: T) -> Self { Self { index_cache_size_bytes: DEFAULT_INDEX_CACHE_SIZE, @@ -80,54 +119,78 @@ impl DatasetBuilder { manifest: None, file_reader_options: None, storage_options_override: None, + storage_options: HashMap::new(), + storage_options_provider: None, } } - /// Create a DatasetBuilder from a LanceNamespace + /// Create a DatasetBuilder from a LanceNamespace. /// - /// This will automatically fetch the table location and storage options from the namespace - /// via `describe_table()`. + /// When called, this method automatically: /// - /// Storage options from the namespace will override any user-provided storage options - /// set via `.with_storage_options()`. This ensures the namespace is always the source - /// of truth for storage options. + /// 1. Calls `describe_table()` on the namespace to get table metadata + /// 2. Uses the returned location as the dataset URI + /// 3. Stores namespace storage options to override user-provided storage options + /// (namespace options take precedence) + /// 4. Creates a [`LanceNamespaceStorageOptionsProvider`] for automatic credential refresh /// /// # Arguments /// * `namespace` - The namespace implementation to fetch table info from - /// * `table_id` - The table identifier (e.g., vec!["my_table"]) + /// * `table_id` - The table identifier (e.g., `vec!["my_table".to_string()]`) /// * `ignore_namespace_table_storage_options` - If true, storage options returned from - /// the namespace's `describe_table()` will be ignored (treated as None). Defaults to false. + /// the namespace's `describe_table()` will be ignored (treated as None) /// - /// # Example + /// # Example: Basic usage /// ```ignore /// use lance_namespace_impls::ConnectBuilder; /// use lance::dataset::DatasetBuilder; /// - /// // Connect to a REST namespace /// let namespace = ConnectBuilder::new("rest") /// .property("uri", "http://localhost:8080") /// .connect() /// .await?; /// - /// // Load a dataset using storage options from namespace /// let dataset = DatasetBuilder::from_namespace( - /// namespace.clone(), + /// namespace, /// vec!["my_table".to_string()], /// false, /// ) /// .await? /// .load() /// .await?; + /// ``` /// - /// // Load a dataset ignoring namespace storage options - /// let dataset = DatasetBuilder::from_namespace( + /// # Example: With custom storage options provider + /// + /// If you need to override the auto-created storage options provider: + /// + /// ```ignore + /// let dataset = DatasetBuilder::from_namespace(namespace, table_id, false) + /// .await? 
+ /// .with_storage_options_provider(my_custom_provider) + /// .load() + /// .await?; + /// ``` + /// + /// # Using URI with namespace credentials + /// + /// If you need to use a specific URI with namespace-based credential refresh, + /// use [`from_uri`](Self::from_uri) with + /// [`with_storage_options_provider`](Self::with_storage_options_provider) instead: + /// + /// ```ignore + /// use std::sync::Arc; + /// use lance_io::object_store::LanceNamespaceStorageOptionsProvider; + /// + /// let provider = Arc::new(LanceNamespaceStorageOptionsProvider::new( /// namespace, /// vec!["my_table".to_string()], - /// true, - /// ) - /// .await? - /// .load() - /// .await?; + /// )); + /// + /// let dataset = DatasetBuilder::from_uri("s3://bucket/table.lance") + /// .with_storage_options_provider(provider) + /// .load() + /// .await?; /// ``` pub async fn from_namespace( namespace: Arc, @@ -165,8 +228,11 @@ impl DatasetBuilder { builder.storage_options_override = namespace_storage_options.clone(); - if namespace_storage_options.is_some() { - builder.options.storage_options_provider = Some(Arc::new( + if let Some(ref initial_opts) = namespace_storage_options { + // Set initial options from namespace + builder.storage_options = initial_opts.clone(); + // Set provider for credential refresh + builder.storage_options_provider = Some(Arc::new( LanceNamespaceStorageOptionsProvider::new(namespace, table_id), )); } @@ -290,7 +356,7 @@ impl DatasetBuilder { /// - [S3 options](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html#variants) /// - [Google options](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants) pub fn with_storage_options(mut self, storage_options: HashMap) -> Self { - self.options.storage_options = Some(storage_options); + self.storage_options.extend(storage_options); self } @@ -302,9 +368,8 @@ impl DatasetBuilder { /// .with_storage_option("region", "us-east-1"); /// ``` pub fn with_storage_option(mut self, key: impl AsRef, value: impl AsRef) -> Self { - let mut storage_options = self.options.storage_options.unwrap_or_default(); - storage_options.insert(key.as_ref().to_string(), value.as_ref().to_string()); - self.options.storage_options = Some(storage_options); + self.storage_options + .insert(key.as_ref().to_string(), value.as_ref().to_string()); self } @@ -356,7 +421,16 @@ impl DatasetBuilder { mut self, provider: Arc, ) -> Self { - self.options.storage_options_provider = Some(provider); + self.storage_options_provider = Some(provider); + self + } + + /// Set the entire [ObjectStoreParams] configuration. + /// + /// This is useful when you want to propagate all store configuration + /// (storage options, provider, credentials, etc.) from one context to another. 
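+    ///
+    /// # Example (editor's sketch)
+    ///
+    /// Mirrors the pattern `FragmentCreateBuilder::existing_dataset_schema` adopts
+    /// later in this patch:
+    ///
+    /// ```ignore
+    /// let store_params = write_params.store_params.clone().unwrap_or_default();
+    /// let dataset = DatasetBuilder::from_uri(uri)
+    ///     .with_store_params(store_params)
+    ///     .load()
+    ///     .await?;
+    /// ```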
+ pub fn with_store_params(mut self, params: ObjectStoreParams) -> Self { + self.options = params; self } @@ -417,13 +491,8 @@ impl DatasetBuilder { None => commit_handler_from_url(&self.table_uri, &Some(self.options.clone())).await, }?; - let storage_options = self - .options - .storage_options - .clone() - .map(StorageOptions::new) - .unwrap_or_default(); - let download_retry_count = storage_options.download_retry_count(); + let accessor = self.options.accessor(); + let download_retry_count = accessor.download_retry_count().await?; let store_registry = self .session @@ -479,14 +548,35 @@ impl DatasetBuilder { } async fn load_impl(mut self) -> Result { - // Apply storage_options_override last to ensure namespace options take precedence + // Build the final storage options by merging user options with overrides + let mut final_storage_options = self.storage_options.clone(); if let Some(override_opts) = self.storage_options_override.take() { - let mut merged_opts = self.options.storage_options.clone().unwrap_or_default(); - // Override with namespace storage options - they take precedence - merged_opts.extend(override_opts); - self.options.storage_options = Some(merged_opts); + // Override options take precedence (e.g., from namespace) + final_storage_options.extend(override_opts); } + // Construct the StorageOptionsAccessor at build time + let accessor = + if final_storage_options.is_empty() && self.storage_options_provider.is_none() { + None + } else { + Some(Arc::new(match self.storage_options_provider.take() { + Some(provider) if final_storage_options.is_empty() => { + StorageOptionsAccessor::new_with_provider( + provider, + self.options.s3_credentials_refresh_offset, + ) + } + Some(provider) => StorageOptionsAccessor::new_with_options_and_provider( + final_storage_options, + provider, + self.options.s3_credentials_refresh_offset, + ), + None => StorageOptionsAccessor::new_with_options(final_storage_options), + })) + }; + self.options.storage_options_accessor = accessor; + let session = match self.session.as_ref() { Some(session) => session.clone(), None => Arc::new(Session::new( diff --git a/rust/lance/src/dataset/fragment/write.rs b/rust/lance/src/dataset/fragment/write.rs index 0ab7dd15d21..7be4c7b2d5d 100644 --- a/rust/lance/src/dataset/fragment/write.rs +++ b/rust/lance/src/dataset/fragment/write.rs @@ -129,10 +129,11 @@ impl<'a> FragmentCreateBuilder<'a> { Self::validate_schema(&schema, stream.schema().as_ref())?; + let store_params = params.store_params.clone().unwrap_or_default(); let (object_store, base_path) = ObjectStore::from_uri_and_params( params.store_registry(), self.dataset_uri, - ¶ms.store_params.clone().unwrap_or_default(), + &store_params, ) .await?; let data_file_key = generate_random_filename(); @@ -214,10 +215,11 @@ impl<'a> FragmentCreateBuilder<'a> { Self::validate_schema(&schema, stream.schema().as_ref())?; let version = params.data_storage_version.unwrap_or_default(); + let store_params = params.store_params.clone().unwrap_or_default(); let (object_store, base_path) = ObjectStore::from_uri_and_params( params.store_registry(), self.dataset_uri, - ¶ms.store_params.clone().unwrap_or_default(), + &store_params, ) .await?; do_write_fragments( @@ -251,10 +253,11 @@ impl<'a> FragmentCreateBuilder<'a> { Self::validate_schema(&schema, stream.schema().as_ref())?; + let store_params = params.store_params.clone().unwrap_or_default(); let (object_store, base_path) = ObjectStore::from_uri_and_params( params.store_registry(), self.dataset_uri, - 
¶ms.store_params.clone().unwrap_or_default(), + &store_params, ) .await?; let filename = format!("{}.lance", generate_random_filename()); @@ -303,12 +306,8 @@ impl<'a> FragmentCreateBuilder<'a> { async fn existing_dataset_schema(&self) -> Result> { let mut builder = DatasetBuilder::from_uri(self.dataset_uri); - let storage_options = self - .write_params - .and_then(|p| p.store_params.as_ref()) - .and_then(|p| p.storage_options.clone()); - if let Some(storage_options) = storage_options { - builder = builder.with_storage_options(storage_options); + if let Some(store_params) = self.write_params.and_then(|p| p.store_params.as_ref()) { + builder = builder.with_store_params(store_params.clone()); } match builder.load().await { Ok(dataset) => { diff --git a/rust/lance/src/dataset/write/commit.rs b/rust/lance/src/dataset/write/commit.rs index f5ac12d0559..7f54e6ab987 100644 --- a/rust/lance/src/dataset/write/commit.rs +++ b/rust/lance/src/dataset/write/commit.rs @@ -207,12 +207,9 @@ impl<'a> CommitBuilder<'a> { ObjectStore::extract_path_from_uri(session.store_registry(), uri)?, ) } else { - ObjectStore::from_uri_and_params( - session.store_registry(), - uri, - &self.store_params.clone().unwrap_or_default(), - ) - .await? + let store_params = self.store_params.clone().unwrap_or_default(); + ObjectStore::from_uri_and_params(session.store_registry(), uri, &store_params) + .await? }; (object_store, base_path, commit_handler) } diff --git a/rust/lance/src/dataset/write/insert.rs b/rust/lance/src/dataset/write/insert.rs index d1ee6db4145..8693c7bf853 100644 --- a/rust/lance/src/dataset/write/insert.rs +++ b/rust/lance/src/dataset/write/insert.rs @@ -357,18 +357,12 @@ impl<'a> InsertBuilder<'a> { .as_ref() .map(|s| s.store_registry()) .unwrap_or_else(|| Arc::new(Default::default())); - let (object_store, base_path) = ObjectStore::from_uri_and_params( - registry, - uri, - ¶ms.store_params.clone().unwrap_or_default(), - ) - .await?; - let commit_handler = resolve_commit_handler( - uri, - params.commit_handler.clone(), - ¶ms.store_params, - ) - .await?; + let store_params = params.store_params.clone().unwrap_or_default(); + let (object_store, base_path) = + ObjectStore::from_uri_and_params(registry, uri, &store_params).await?; + let commit_handler = + resolve_commit_handler(uri, params.commit_handler.clone(), &Some(store_params)) + .await?; (object_store, base_path, commit_handler) } }; From 1a721dfdc1904581f1e37dc192413d21d460ac9e Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Fri, 19 Dec 2025 00:54:35 -0800 Subject: [PATCH 2/2] refactor2 --- java/lance-jni/src/blocking_dataset.rs | 11 +- java/src/main/java/org/lance/Dataset.java | 6 +- python/python/lance/dataset.py | 4 +- python/src/dataset.rs | 4 +- rust/lance-io/src/object_store.rs | 18 +- rust/lance-io/src/object_store/providers.rs | 35 ++-- .../src/object_store/providers/aws.rs | 24 ++- .../src/object_store/providers/azure.rs | 53 ++--- .../src/object_store/providers/gcp.rs | 7 +- .../src/object_store/providers/huggingface.rs | 12 +- .../src/object_store/providers/local.rs | 14 +- .../src/object_store/providers/memory.rs | 12 +- .../src/object_store/storage_options.rs | 187 ++++++++++++------ rust/lance-io/src/scheduler.rs | 15 +- rust/lance/src/dataset.rs | 43 ++-- rust/lance/src/dataset/builder.rs | 2 +- 16 files changed, 252 insertions(+), 195 deletions(-) diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index 1357b6b36ba..655ffa0d1ea 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ 
b/java/lance-jni/src/blocking_dataset.rs @@ -1567,24 +1567,21 @@ fn inner_get_config<'local>( } #[no_mangle] -pub extern "system" fn Java_org_lance_Dataset_nativeGetCurrentStorageOptions<'local>( +pub extern "system" fn Java_org_lance_Dataset_nativeGetStorageOptions<'local>( mut env: JNIEnv<'local>, java_dataset: JObject, ) -> JObject<'local> { - ok_or_throw!( - env, - inner_get_current_storage_options(&mut env, java_dataset) - ) + ok_or_throw!(env, inner_get_storage_options(&mut env, java_dataset)) } -fn inner_get_current_storage_options<'local>( +fn inner_get_storage_options<'local>( env: &mut JNIEnv<'local>, java_dataset: JObject, ) -> Result> { let options = { let dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; - RT.block_on(dataset_guard.inner.current_storage_options()) + RT.block_on(dataset_guard.inner.storage_options()) .map_err(|e| Error::io_error(e.to_string()))? }; diff --git a/java/src/main/java/org/lance/Dataset.java b/java/src/main/java/org/lance/Dataset.java index a53a3a39291..2c5af82f174 100644 --- a/java/src/main/java/org/lance/Dataset.java +++ b/java/src/main/java/org/lance/Dataset.java @@ -1004,14 +1004,14 @@ public Map getConfig() { * * @return the current storage options */ - public Map getCurrentStorageOptions() { + public Map getStorageOptions() { try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) { Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed"); - return nativeGetCurrentStorageOptions(); + return nativeGetStorageOptions(); } } - private native Map nativeGetCurrentStorageOptions(); + private native Map nativeGetStorageOptions(); /** * Compact the dataset to improve performance. diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 041d8628203..fd28aa672a6 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -951,7 +951,7 @@ def max_field_id(self) -> int: """ return self._ds.max_field_id - def current_storage_options(self) -> Dict[str, str]: + def storage_options(self) -> Dict[str, str]: """Get the current storage options. Returns a dictionary combining the initial storage_options with any @@ -964,7 +964,7 @@ def current_storage_options(self) -> Dict[str, str]: If neither storage_options nor storage_options_provider were specified when opening the dataset, an empty dictionary is returned. """ - return self._ds.current_storage_options() + return self._ds.storage_options() def to_table( self, diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 2cd5927be5f..8f514c97c73 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -650,8 +650,8 @@ impl Dataset { /// Returns a dictionary combining the initial storage_options with any /// overrides from the storage_options_provider. Provider options take /// precedence when keys conflict. Options are cached until they expire. - fn current_storage_options(&self) -> PyResult> { - rt().block_on(None, self.ds.current_storage_options())? + fn storage_options(&self) -> PyResult> { + rt().block_on(None, self.ds.storage_options())? 
.map_err(|err| PyIOError::new_err(err.to_string())) } diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index 4b6694097c9..f7eacfada8d 100644 --- a/rust/lance-io/src/object_store.rs +++ b/rust/lance-io/src/object_store.rs @@ -289,16 +289,6 @@ impl ObjectStoreParams { .clone() .unwrap_or_else(|| Arc::new(StorageOptionsAccessor::new_with_options(HashMap::new()))) } - - /// Get a reference to the initial storage options (synchronous, no provider refresh). - /// - /// This is primarily used for synchronous operations that need to access - /// storage options without going through the async provider refresh mechanism. - pub fn storage_options_ref(&self) -> Option<&HashMap> { - self.storage_options_accessor - .as_ref() - .and_then(|a| a.initial_options()) - } } /// Convert a URI string or local path to a URL @@ -424,8 +414,8 @@ impl ObjectStore { #[allow(deprecated)] if let Some((store, path)) = params.object_store.as_ref() { let mut inner = store.clone(); - let store_prefix = - registry.calculate_object_store_prefix(uri, params.storage_options_ref())?; + let accessor = params.accessor(); + let store_prefix = registry.calculate_object_store_prefix(uri, &accessor)?; if let Some(wrapper) = params.object_store_wrapper.as_ref() { inner = wrapper.wrap(&store_prefix, inner); } @@ -792,7 +782,7 @@ impl ObjectStore { list_is_lexically_ordered: bool, io_parallelism: usize, download_retry_count: usize, - storage_options: Option<&HashMap>, + accessor: &StorageOptionsAccessor, ) -> Self { let scheme = location.scheme(); let block_size = block_size.unwrap_or_else(|| infer_block_size(scheme)); @@ -800,7 +790,7 @@ impl ObjectStore { let store = match wrapper { Some(wrapper) => { let store_prefix = DEFAULT_OBJECT_STORE_REGISTRY - .calculate_object_store_prefix(location.as_ref(), storage_options) + .calculate_object_store_prefix(location.as_ref(), accessor) .unwrap(); wrapper.wrap(&store_prefix, store) } diff --git a/rust/lance-io/src/object_store/providers.rs b/rust/lance-io/src/object_store/providers.rs index 319375f186e..f5445e9f584 100644 --- a/rust/lance-io/src/object_store/providers.rs +++ b/rust/lance-io/src/object_store/providers.rs @@ -13,7 +13,9 @@ use url::Url; use crate::object_store::uri_to_url; use crate::object_store::WrappingObjectStore; -use super::{tracing::ObjectStoreTracingExt, ObjectStore, ObjectStoreParams}; +use super::{ + tracing::ObjectStoreTracingExt, ObjectStore, ObjectStoreParams, StorageOptionsAccessor, +}; use lance_core::error::{Error, LanceOptionExt, Result}; #[cfg(feature = "aws")] @@ -61,7 +63,7 @@ pub trait ObjectStoreProvider: std::fmt::Debug + Sync + Send { fn calculate_object_store_prefix( &self, url: &Url, - _storage_options: Option<&HashMap>, + _accessor: &StorageOptionsAccessor, ) -> Result { Ok(format!("{}${}", url.scheme(), url.authority())) } @@ -171,8 +173,8 @@ impl ObjectStoreRegistry { return Err(self.scheme_not_found_error(scheme)); }; - let cache_path = - provider.calculate_object_store_prefix(&base_path, params.storage_options_ref())?; + let accessor = params.accessor(); + let cache_path = provider.calculate_object_store_prefix(&base_path, &accessor)?; let cache_key = (cache_path.clone(), params.clone()); // Check if we have a cached store for this base path and params @@ -225,12 +227,12 @@ impl ObjectStoreRegistry { Ok(store) } - /// Calculate the datastore prefix based on the URI and the storage options. + /// Calculate the datastore prefix based on the URI and the accessor. 
/// The data store prefix should uniquely identify the datastore. pub fn calculate_object_store_prefix( &self, uri: &str, - storage_options: Option<&HashMap>, + accessor: &StorageOptionsAccessor, ) -> Result { let url = uri_to_url(uri)?; match self.get_provider(url.scheme()) { @@ -241,7 +243,7 @@ impl ObjectStoreRegistry { Err(self.scheme_not_found_error(url.scheme())) } } - Some(provider) => provider.calculate_object_store_prefix(&url, storage_options), + Some(provider) => provider.calculate_object_store_prefix(&url, accessor), } } } @@ -298,6 +300,8 @@ impl ObjectStoreRegistry { mod tests { use super::*; + use crate::object_store::StorageOptionsAccessor; + #[derive(Debug)] struct DummyProvider; @@ -316,9 +320,12 @@ mod tests { fn test_calculate_object_store_prefix() { let provider = DummyProvider; let url = Url::parse("dummy://blah/path").unwrap(); + let accessor = StorageOptionsAccessor::new_with_options(HashMap::new()); assert_eq!( "dummy$blah", - provider.calculate_object_store_prefix(&url, None).unwrap() + provider + .calculate_object_store_prefix(&url, &accessor) + .unwrap() ); } @@ -326,9 +333,10 @@ mod tests { fn test_calculate_object_store_scheme_not_found() { let registry = ObjectStoreRegistry::empty(); registry.insert("dummy", Arc::new(DummyProvider)); + let accessor = StorageOptionsAccessor::new_with_options(HashMap::new()); let s = "Invalid user input: No object store provider found for scheme: 'dummy2'\nValid schemes: dummy"; let result = registry - .calculate_object_store_prefix("dummy2://mybucket/my/long/path", None) + .calculate_object_store_prefix("dummy2://mybucket/my/long/path", &accessor) .expect_err("expected error") .to_string(); assert_eq!(s, &result[..s.len()]); @@ -338,10 +346,11 @@ mod tests { #[test] fn test_calculate_object_store_prefix_for_local() { let registry = ObjectStoreRegistry::empty(); + let accessor = StorageOptionsAccessor::new_with_options(HashMap::new()); assert_eq!( "file", registry - .calculate_object_store_prefix("/tmp/foobar", None) + .calculate_object_store_prefix("/tmp/foobar", &accessor) .unwrap() ); } @@ -350,10 +359,11 @@ mod tests { #[test] fn test_calculate_object_store_prefix_for_local_windows_path() { let registry = ObjectStoreRegistry::empty(); + let accessor = StorageOptionsAccessor::new_with_options(HashMap::new()); assert_eq!( "file", registry - .calculate_object_store_prefix("c://dos/path", None) + .calculate_object_store_prefix("c://dos/path", &accessor) .unwrap() ); } @@ -363,10 +373,11 @@ mod tests { fn test_calculate_object_store_prefix_for_dummy_path() { let registry = ObjectStoreRegistry::empty(); registry.insert("dummy", Arc::new(DummyProvider)); + let accessor = StorageOptionsAccessor::new_with_options(HashMap::new()); assert_eq!( "dummy$mybucket", registry - .calculate_object_store_prefix("dummy://mybucket/my/long/path", None) + .calculate_object_store_prefix("dummy://mybucket/my/long/path", &accessor) .unwrap() ); } diff --git a/rust/lance-io/src/object_store/providers/aws.rs b/rust/lance-io/src/object_store/providers/aws.rs index aacedadf34d..35df337036b 100644 --- a/rust/lance-io/src/object_store/providers/aws.rs +++ b/rust/lance-io/src/object_store/providers/aws.rs @@ -42,10 +42,11 @@ impl AwsStoreProvider { &self, base_path: &mut Url, params: &ObjectStoreParams, - accessor: &Arc, - storage_options: &HashMap, is_s3_express: bool, ) -> Result> { + let accessor = params.accessor(); + let storage_options = accessor.get_s3_storage_options().await?; + let max_retries = accessor.client_max_retries().await?; let 
retry_timeout = accessor.client_retry_timeout().await?; let retry_config = RetryConfig { @@ -54,7 +55,7 @@ impl AwsStoreProvider { retry_timeout: Duration::from_secs(retry_timeout), }; - let mut s3_storage_options = StorageOptionsAccessor::as_s3_options(storage_options); + let mut s3_storage_options = StorageOptionsAccessor::as_s3_options(&storage_options); let region = resolve_s3_region(base_path, &s3_storage_options).await?; let storage_options_provider = accessor.provider(); let expires_at_millis = storage_options @@ -139,8 +140,10 @@ impl ObjectStoreProvider for AwsStoreProvider { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); let accessor = params.accessor(); - // Get merged storage options with env vars applied - let storage_options = accessor.get_options_with_s3_env().await?; + // Apply S3 environment variables to the accessor once + accessor.apply_s3_env().await; + + // Get configuration from merged options let download_retry_count = accessor.download_retry_count().await?; let use_opendal = accessor.s3_use_opendal().await?; let is_s3_express_opt = accessor.s3_express().await?; @@ -151,18 +154,13 @@ impl ObjectStoreProvider for AwsStoreProvider { let inner = if use_opendal { // Use OpenDAL implementation + let storage_options = accessor.get_s3_storage_options().await?; self.build_opendal_s3_store(&base_path, &storage_options) .await? } else { // Use default Amazon S3 implementation - self.build_amazon_s3_store( - &mut base_path, - params, - &accessor, - &storage_options, - is_s3_express, - ) - .await? + self.build_amazon_s3_store(&mut base_path, params, is_s3_express) + .await? }; Ok(ObjectStore { diff --git a/rust/lance-io/src/object_store/providers/azure.rs b/rust/lance-io/src/object_store/providers/azure.rs index 696be85e0d1..96f092f235b 100644 --- a/rust/lance-io/src/object_store/providers/azure.rs +++ b/rust/lance-io/src/object_store/providers/azure.rs @@ -93,8 +93,11 @@ impl ObjectStoreProvider for AzureBlobStoreProvider { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); let accessor = params.accessor(); - // Get merged storage options with Azure env vars applied - let storage_options = accessor.get_options_with_azure_env().await?; + // Apply Azure environment variables to the accessor once + accessor.apply_azure_env().await; + + // Get configuration from merged options + let storage_options = accessor.get_azure_storage_options().await?; let download_retry_count = accessor.download_retry_count().await?; let use_opendal = accessor.azure_use_opendal().await?; @@ -122,7 +125,7 @@ impl ObjectStoreProvider for AzureBlobStoreProvider { fn calculate_object_store_prefix( &self, url: &Url, - storage_options: Option<&HashMap>, + accessor: &StorageOptionsAccessor, ) -> Result { let authority = url.authority(); let (container, account) = match authority.find("@") { @@ -139,17 +142,15 @@ impl ObjectStoreProvider for AzureBlobStoreProvider { None => { // The URI looks like 'az://container/path-part/file'. // We must look at the storage options to find the account. 
- let mut account = match storage_options { - Some(opts) => StorageOptionsAccessor::find_configured_storage_account(opts), - None => None, - }; - if account.is_none() { - account = StorageOptionsAccessor::find_configured_storage_account(&ENV_OPTIONS); - } - let account = account.ok_or(Error::invalid_input( - "Unable to find object store prefix: no Azure account name in URI, and no storage account configured.", - location!(), - ))?; + let account = accessor + .find_azure_storage_account() + .or_else(|| { + StorageOptionsAccessor::find_configured_storage_account(&ENV_OPTIONS) + }) + .ok_or(Error::invalid_input( + "Unable to find object store prefix: no Azure account name in URI, and no storage account configured.", + location!(), + ))?; (authority, account) } }; @@ -226,13 +227,16 @@ mod tests { #[test] fn test_calculate_object_store_prefix_from_url_and_options() { let provider = AzureBlobStoreProvider; - let options = HashMap::from_iter([("account_name".to_string(), "bob".to_string())]); + let accessor = StorageOptionsAccessor::new_with_options(HashMap::from_iter([( + "account_name".to_string(), + "bob".to_string(), + )])); assert_eq!( "az$container@bob", provider .calculate_object_store_prefix( &Url::parse("az://container/path").unwrap(), - Some(&options) + &accessor ) .unwrap() ); @@ -241,13 +245,16 @@ mod tests { #[test] fn test_calculate_object_store_prefix_from_url_and_ignored_options() { let provider = AzureBlobStoreProvider; - let options = HashMap::from_iter([("account_name".to_string(), "bob".to_string())]); + let accessor = StorageOptionsAccessor::new_with_options(HashMap::from_iter([( + "account_name".to_string(), + "bob".to_string(), + )])); assert_eq!( "az$container@account", provider .calculate_object_store_prefix( &Url::parse("az://container@account.dfs.core.windows.net/path").unwrap(), - Some(&options) + &accessor ) .unwrap() ); @@ -256,13 +263,13 @@ mod tests { #[test] fn test_fail_to_calculate_object_store_prefix_from_url() { let provider = AzureBlobStoreProvider; - let options = HashMap::from_iter([("access_key".to_string(), "myaccesskey".to_string())]); + let accessor = StorageOptionsAccessor::new_with_options(HashMap::from_iter([( + "access_key".to_string(), + "myaccesskey".to_string(), + )])); let expected = "Invalid user input: Unable to find object store prefix: no Azure account name in URI, and no storage account configured."; let result = provider - .calculate_object_store_prefix( - &Url::parse("az://container/path").unwrap(), - Some(&options), - ) + .calculate_object_store_prefix(&Url::parse("az://container/path").unwrap(), &accessor) .expect_err("expected error") .to_string(); assert_eq!(expected, &result[..expected.len()]); diff --git a/rust/lance-io/src/object_store/providers/gcp.rs b/rust/lance-io/src/object_store/providers/gcp.rs index 9c20112909a..4b9be7c4feb 100644 --- a/rust/lance-io/src/object_store/providers/gcp.rs +++ b/rust/lance-io/src/object_store/providers/gcp.rs @@ -96,8 +96,11 @@ impl ObjectStoreProvider for GcsStoreProvider { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); let accessor = params.accessor(); - // Get merged storage options with GCS env vars applied - let storage_options = accessor.get_options_with_gcs_env().await?; + // Apply GCS environment variables to the accessor once + accessor.apply_gcs_env().await; + + // Get configuration from merged options + let storage_options = accessor.get_gcs_storage_options().await?; let download_retry_count = accessor.download_retry_count().await?; let use_opendal = 
accessor.gcs_use_opendal().await?; diff --git a/rust/lance-io/src/object_store/providers/huggingface.rs b/rust/lance-io/src/object_store/providers/huggingface.rs index 833783f8541..a37861f225a 100644 --- a/rust/lance-io/src/object_store/providers/huggingface.rs +++ b/rust/lance-io/src/object_store/providers/huggingface.rs @@ -13,8 +13,8 @@ use url::Url; use crate::object_store::parse_hf_repo_id; use crate::object_store::{ - ObjectStore, ObjectStoreParams, ObjectStoreProvider, DEFAULT_CLOUD_BLOCK_SIZE, - DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, + ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptionsAccessor, + DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, }; use lance_core::error::{Error, Result}; @@ -125,7 +125,7 @@ impl ObjectStoreProvider for HuggingfaceStoreProvider { fn calculate_object_store_prefix( &self, url: &Url, - _storage_options: Option<&HashMap>, + _accessor: &StorageOptionsAccessor, ) -> Result { let repo_id = parse_hf_repo_id(url)?; Ok(format!("{}${}@{}", url.scheme(), url.authority(), repo_id)) @@ -135,7 +135,6 @@ impl ObjectStoreProvider for HuggingfaceStoreProvider { #[cfg(test)] mod tests { use super::*; - use crate::object_store::StorageOptionsAccessor; #[test] fn parse_basic_url() { @@ -220,7 +219,10 @@ mod tests { fn calculate_prefix_uses_repo_id() { let provider = HuggingfaceStoreProvider; let url = Url::parse("hf://datasets/acme/repo/path").unwrap(); - let prefix = provider.calculate_object_store_prefix(&url, None).unwrap(); + let accessor = StorageOptionsAccessor::new_with_options(HashMap::new()); + let prefix = provider + .calculate_object_store_prefix(&url, &accessor) + .unwrap(); assert_eq!(prefix, "hf$datasets@acme/repo"); } diff --git a/rust/lance-io/src/object_store/providers/local.rs b/rust/lance-io/src/object_store/providers/local.rs index 05b1d7ef91e..9e95872321c 100644 --- a/rust/lance-io/src/object_store/providers/local.rs +++ b/rust/lance-io/src/object_store/providers/local.rs @@ -1,11 +1,11 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::{collections::HashMap, sync::Arc}; +use std::sync::Arc; use crate::object_store::{ - ObjectStore, ObjectStoreParams, ObjectStoreProvider, DEFAULT_LOCAL_BLOCK_SIZE, - DEFAULT_LOCAL_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, + ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptionsAccessor, + DEFAULT_LOCAL_BLOCK_SIZE, DEFAULT_LOCAL_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, }; use lance_core::error::Result; use lance_core::Error; @@ -53,7 +53,7 @@ impl ObjectStoreProvider for FileStoreProvider { fn calculate_object_store_prefix( &self, url: &Url, - _storage_options: Option<&HashMap>, + _accessor: &StorageOptionsAccessor, ) -> Result { Ok(url.scheme().to_string()) } @@ -86,10 +86,11 @@ mod tests { #[test] fn test_calculate_object_store_prefix() { let provider = FileStoreProvider; + let accessor = StorageOptionsAccessor::new_with_options(std::collections::HashMap::new()); assert_eq!( "file", provider - .calculate_object_store_prefix(&Url::parse("file:///etc").unwrap(), None) + .calculate_object_store_prefix(&Url::parse("file:///etc").unwrap(), &accessor) .unwrap() ); } @@ -97,12 +98,13 @@ mod tests { #[test] fn test_calculate_object_store_prefix_for_file_object_store() { let provider = FileStoreProvider; + let accessor = StorageOptionsAccessor::new_with_options(std::collections::HashMap::new()); assert_eq!( "file-object-store", provider .calculate_object_store_prefix( 
&Url::parse("file-object-store:///etc").unwrap(), - None + &accessor ) .unwrap() ); diff --git a/rust/lance-io/src/object_store/providers/memory.rs b/rust/lance-io/src/object_store/providers/memory.rs index 4a0f53558b1..a6958e6ae00 100644 --- a/rust/lance-io/src/object_store/providers/memory.rs +++ b/rust/lance-io/src/object_store/providers/memory.rs @@ -1,11 +1,11 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::{collections::HashMap, sync::Arc}; +use std::sync::Arc; use crate::object_store::{ - ObjectStore, ObjectStoreParams, ObjectStoreProvider, DEFAULT_CLOUD_IO_PARALLELISM, - DEFAULT_LOCAL_BLOCK_SIZE, DEFAULT_MAX_IOP_SIZE, + ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptionsAccessor, + DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_LOCAL_BLOCK_SIZE, DEFAULT_MAX_IOP_SIZE, }; use lance_core::error::Result; use object_store::{memory::InMemory, path::Path}; @@ -46,7 +46,7 @@ impl ObjectStoreProvider for MemoryStoreProvider { fn calculate_object_store_prefix( &self, _url: &Url, - _storage_options: Option<&HashMap>, + _accessor: &StorageOptionsAccessor, ) -> Result { Ok("memory".to_string()) } @@ -55,6 +55,7 @@ impl ObjectStoreProvider for MemoryStoreProvider { #[cfg(test)] mod tests { use super::*; + use std::collections::HashMap; #[test] fn test_memory_store_path() { @@ -69,10 +70,11 @@ mod tests { #[test] fn test_calculate_object_store_prefix() { let provider = MemoryStoreProvider; + let accessor = StorageOptionsAccessor::new_with_options(HashMap::new()); assert_eq!( "memory", provider - .calculate_object_store_prefix(&Url::parse("memory://etc").unwrap(), None) + .calculate_object_store_prefix(&Url::parse("memory://etc").unwrap(), &accessor) .unwrap() ); } diff --git a/rust/lance-io/src/object_store/storage_options.rs b/rust/lance-io/src/object_store/storage_options.rs index f5c165587dd..d674ef03d58 100644 --- a/rust/lance-io/src/object_store/storage_options.rs +++ b/rust/lance-io/src/object_store/storage_options.rs @@ -191,7 +191,8 @@ struct CachedProviderOptions { /// Thread-safe with double-check locking pattern to prevent concurrent refreshes. 
 pub struct StorageOptionsAccessor {
     /// Initial/static storage options (always available immediately)
-    initial_options: Option<HashMap<String, String>>,
+    /// Wrapped in RwLock to allow applying environment variables
+    initial_options: RwLock<Option<HashMap<String, String>>>,
     /// Optional dynamic provider for refreshable options
     provider: Option<Arc<dyn StorageOptionsProvider>>,
     /// Cache for provider options (only used when provider is present)
@@ -204,8 +205,14 @@ impl fmt::Debug for StorageOptionsAccessor {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Use try_read to avoid blocking in Debug
+        let has_options = self
+            .initial_options
+            .try_read()
+            .map(|guard| guard.is_some())
+            .unwrap_or(false);
         f.debug_struct("StorageOptionsAccessor")
-            .field("initial_options", &self.initial_options.is_some())
+            .field("initial_options", &has_options)
             .field("provider", &self.provider)
             .field("refresh_offset", &self.refresh_offset)
             .finish()
@@ -213,10 +220,12 @@ impl fmt::Debug for StorageOptionsAccessor {
 }
 
 // Hash is based on initial_options and provider's provider_id
+// Note: This uses blocking read, so should only be called from sync context
 impl std::hash::Hash for StorageOptionsAccessor {
     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
         // Hash initial options in a deterministic order
-        if let Some(opts) = &self.initial_options {
+        // Use blocking_read since Hash trait is sync
+        if let Some(opts) = self.initial_options.blocking_read().as_ref() {
             let mut sorted_keys: Vec<_> = opts.keys().collect();
             sorted_keys.sort();
             for key in sorted_keys {
@@ -233,7 +242,10 @@ impl std::hash::Hash for StorageOptionsAccessor {
 
 impl PartialEq for StorageOptionsAccessor {
     fn eq(&self, other: &Self) -> bool {
-        self.initial_options == other.initial_options
+        // Use blocking_read since PartialEq trait is sync
+        let self_opts = self.initial_options.blocking_read();
+        let other_opts = other.initial_options.blocking_read();
+        *self_opts == *other_opts
             && self.provider.as_ref().map(|p| p.provider_id())
                 == other.provider.as_ref().map(|p| p.provider_id())
     }
@@ -248,7 +260,7 @@ impl StorageOptionsAccessor {
     /// * `options` - Static storage options
     pub fn new_with_options(options: HashMap<String, String>) -> Self {
         Self {
-            initial_options: Some(options),
+            initial_options: RwLock::new(Some(options)),
             provider: None,
             provider_cache: Arc::new(RwLock::new(None)),
             refresh_offset: Duration::from_secs(60),
@@ -266,7 +278,7 @@ impl StorageOptionsAccessor {
         refresh_offset: Duration,
     ) -> Self {
         Self {
-            initial_options: None,
+            initial_options: RwLock::new(None),
             provider: Some(provider),
             provider_cache: Arc::new(RwLock::new(None)),
             refresh_offset,
@@ -288,7 +300,7 @@ impl StorageOptionsAccessor {
         refresh_offset: Duration,
     ) -> Self {
         Self {
-            initial_options: Some(initial_options),
+            initial_options: RwLock::new(Some(initial_options)),
             provider: Some(provider),
             provider_cache: Arc::new(RwLock::new(None)),
             refresh_offset,
@@ -314,7 +326,7 @@ impl StorageOptionsAccessor {
             .get(EXPIRES_AT_MILLIS_KEY)
             .and_then(|v| v.parse().ok());
         Self {
-            initial_options,
+            initial_options: RwLock::new(initial_options),
             provider: Some(provider),
             provider_cache: Arc::new(RwLock::new(Some(CachedProviderOptions {
                 options: initial_provider_options,
@@ -336,23 +348,24 @@ impl StorageOptionsAccessor {
     /// Get current storage options with version number.
     /// The version number increments on each provider refresh, allowing callers to detect changes.
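     ///
     /// Editor's sketch of the intended use, detecting a provider refresh between
     /// two calls (only this method's own API is assumed):
     ///
     /// ```ignore
     /// let (opts, v1) = accessor.get_options_with_version().await?;
     /// // ... some time later ...
     /// let (_opts, v2) = accessor.get_options_with_version().await?;
     /// if v2 != v1 {
     ///     // The provider refreshed; rebuild anything derived from the old options.
     /// }
     /// ```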
@@ -336,23 +348,24 @@
 
     /// Get current storage options with version number.
     /// The version number increments on each provider refresh, allowing callers to detect changes.
     pub async fn get_options_with_version(&self) -> Result<(Option<HashMap<String, String>>, u64)> {
+        let initial_opts = self.initial_options.read().await.clone();
+
         // If no provider, just return initial options with version 0
         if self.provider.is_none() {
-            return Ok((self.initial_options.clone(), 0));
+            return Ok((initial_opts, 0));
         }
 
         // Get provider options (with caching)
         let (provider_opts, version) = self.get_provider_options_with_version().await?;
 
         // Merge: start with initial, extend with provider (provider takes precedence)
-        let merged = match (&self.initial_options, provider_opts) {
+        let merged = match (initial_opts, provider_opts) {
             (None, None) => None,
-            (Some(initial), None) => Some(initial.clone()),
+            (Some(initial), None) => Some(initial),
             (None, Some(provider)) => Some(provider),
-            (Some(initial), Some(provider)) => {
-                let mut merged = initial.clone();
-                merged.extend(provider);
-                Some(merged)
+            (Some(mut initial), Some(provider)) => {
+                initial.extend(provider);
+                Some(initial)
             }
         };
 
@@ -465,9 +478,16 @@
         self.provider.is_some()
     }
 
-    /// Get the initial options (without provider options)
-    pub fn initial_options(&self) -> Option<&HashMap<String, String>> {
-        self.initial_options.as_ref()
+    /// Find the configured Azure storage account name from initial options (sync)
+    ///
+    /// This checks the initial_options for Azure account name configuration.
+    /// Falls back to environment variables if not found in options.
+    #[cfg(feature = "azure")]
+    pub fn find_azure_storage_account(&self) -> Option<String> {
+        self.initial_options
+            .blocking_read()
+            .as_ref()
+            .and_then(Self::find_configured_storage_account)
     }
 
     /// Get the provider, if present
@@ -489,7 +509,7 @@
         refresh_offset: Duration,
     ) -> Self {
         Self {
-            initial_options: Some(new_initial_options),
+            initial_options: RwLock::new(Some(new_initial_options)),
             provider: self.provider.clone(),
             // Create new cache - we don't copy the old cache since credentials
             // should be fresh from the provider
@@ -514,10 +534,14 @@
         provider: Arc<dyn StorageOptionsProvider>,
         refresh_offset: Duration,
    ) -> Self {
-        let mut merged = self.initial_options.clone().unwrap_or_default();
+        let mut merged = self
+            .initial_options
+            .blocking_read()
+            .clone()
+            .unwrap_or_default();
         merged.extend(additional_options);
         Self {
-            initial_options: Some(merged),
+            initial_options: RwLock::new(Some(merged)),
             provider: Some(provider),
             provider_cache: Arc::new(RwLock::new(None)),
             refresh_offset,
@@ -525,6 +549,43 @@
         }
     }
 
+    /// Apply S3 environment variables to initial options
+    ///
+    /// This mutates the accessor's internal state to include S3-specific environment variables.
+    /// Should be called once when the accessor is first used for S3 operations.
+    #[cfg(feature = "aws")]
+    pub async fn apply_s3_env(&self) {
+        let mut opts = self.initial_options.write().await;
+        let options = opts.get_or_insert_with(HashMap::new);
+        Self::apply_env_overrides(options);
+        Self::apply_env_s3(options);
+    }
+
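The merge step above is a plain `HashMap::extend`, which fixes the precedence rule: on a key collision the provider's fresh value replaces the initial/static one. A standalone illustration of just that rule:

    use std::collections::HashMap;

    let mut merged: HashMap<String, String> = HashMap::from([
        ("aws_region".to_string(), "us-east-1".to_string()),
        ("aws_session_token".to_string(), "stale".to_string()),
    ]);
    let from_provider: HashMap<String, String> =
        HashMap::from([("aws_session_token".to_string(), "fresh".to_string())]);
    merged.extend(from_provider); // provider wins on collisions
    assert_eq!(merged["aws_session_token"], "fresh");
    assert_eq!(merged["aws_region"], "us-east-1"); // non-colliding keys survive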
+ #[cfg(feature = "azure")] + pub async fn apply_azure_env(&self) { + let mut opts = self.initial_options.write().await; + let options = opts.get_or_insert_with(HashMap::new); + Self::apply_env_overrides(options); + let env_options = Self::from_env_azure(); + Self::apply_env_azure(options, &env_options); + } + + /// Apply GCS environment variables to initial options + /// + /// This mutates the accessor's internal state to include GCS-specific environment variables. + /// Should be called once when the accessor is first used for GCS operations. + #[cfg(feature = "gcp")] + pub async fn apply_gcs_env(&self) { + let mut opts = self.initial_options.write().await; + let options = opts.get_or_insert_with(HashMap::new); + Self::apply_env_overrides(options); + Self::apply_env_gcs(options); + } + // ========================================================================= // Async helper methods for extracting configuration from merged options // These use get_options() to get merged/refreshed options from the provider @@ -561,11 +622,6 @@ impl StorageOptionsAccessor { Ok(opts.and_then(|o| o.get(key).cloned())) } - /// Get a value from initial options by key (synchronous, no provider refresh) - pub fn get_initial(&self, key: &str) -> Option<&String> { - self.initial_options.as_ref().and_then(|opts| opts.get(key)) - } - /// Denotes if unsecure connections via http are allowed (async, uses merged options) pub async fn allow_http(&self) -> Result { let opts = self.get_options().await?; @@ -702,31 +758,29 @@ impl StorageOptionsAccessor { Ok(Self::as_s3_options(&opts)) } - /// Get merged storage options with S3 env vars applied (async, refreshes if needed) + /// Get merged storage options for S3 (async, refreshes if needed) /// - /// This method: - /// 1. Gets merged options from initial + provider - /// 2. Applies common env overrides - /// 3. Applies S3-specific env vars + /// Note: Call `apply_s3_env()` once before using this method to ensure + /// environment variables are applied to the accessor. #[cfg(feature = "aws")] - pub async fn get_options_with_s3_env(&self) -> Result> { - let mut opts = self.get_options().await?.unwrap_or_default(); - Self::apply_env_overrides(&mut opts); - Self::apply_env_s3(&mut opts); - Ok(opts) + pub async fn get_s3_storage_options(&self) -> Result> { + Ok(self.get_options().await?.unwrap_or_default()) } - /// Get merged storage options as S3-specific options map with env vars applied + /// Get merged storage options as S3-specific options map + /// + /// Note: Call `apply_s3_env()` once before using this method to ensure + /// environment variables are applied to the accessor. 
#[cfg(feature = "aws")] pub async fn get_s3_options_with_env(&self) -> Result> { - let opts = self.get_options_with_s3_env().await?; + let opts = self.get_options().await?.unwrap_or_default(); Ok(Self::as_s3_options(&opts)) } /// Check if use_opendal flag is set for S3 #[cfg(feature = "aws")] pub async fn s3_use_opendal(&self) -> Result { - let opts = self.get_options_with_s3_env().await?; + let opts = self.get_options().await?.unwrap_or_default(); Ok(opts .get("use_opendal") .map(|v| v == "true") @@ -736,14 +790,14 @@ impl StorageOptionsAccessor { /// Check if S3 Express mode is enabled #[cfg(feature = "aws")] pub async fn s3_express(&self) -> Result { - let opts = self.get_options_with_s3_env().await?; + let opts = self.get_options().await?.unwrap_or_default(); Ok(opts.get("s3_express").map(|v| v == "true").unwrap_or(false)) } /// Check if this is Cloudflare R2 (requires constant size upload parts) #[cfg(feature = "aws")] pub async fn is_cloudflare_r2(&self) -> Result { - let opts = self.get_options_with_s3_env().await?; + let opts = self.get_options().await?.unwrap_or_default(); Ok(opts .get("aws_endpoint") .map(|endpoint| endpoint.contains("r2.cloudflarestorage.com")) @@ -800,27 +854,29 @@ impl StorageOptionsAccessor { Ok(Self::as_azure_options(&opts)) } - /// Get merged storage options with Azure env vars applied (async, refreshes if needed) + /// Get merged storage options for Azure (async, refreshes if needed) + /// + /// Note: Call `apply_azure_env()` once before using this method to ensure + /// environment variables are applied to the accessor. #[cfg(feature = "azure")] - pub async fn get_options_with_azure_env(&self) -> Result> { - let env_options = Self::from_env_azure(); - let mut opts = self.get_options().await?.unwrap_or_default(); - Self::apply_env_overrides(&mut opts); - Self::apply_env_azure(&mut opts, &env_options); - Ok(opts) + pub async fn get_azure_storage_options(&self) -> Result> { + Ok(self.get_options().await?.unwrap_or_default()) } - /// Get merged storage options as Azure-specific options map with env vars applied + /// Get merged storage options as Azure-specific options map + /// + /// Note: Call `apply_azure_env()` once before using this method to ensure + /// environment variables are applied to the accessor. 
#[cfg(feature = "azure")] pub async fn get_azure_options_with_env(&self) -> Result> { - let opts = self.get_options_with_azure_env().await?; + let opts = self.get_options().await?.unwrap_or_default(); Ok(Self::as_azure_options(&opts)) } /// Check if use_opendal flag is set for Azure #[cfg(feature = "azure")] pub async fn azure_use_opendal(&self) -> Result { - let opts = self.get_options_with_azure_env().await?; + let opts = self.get_options().await?.unwrap_or_default(); Ok(opts .get("use_opendal") .map(|v| v == "true") @@ -840,10 +896,10 @@ impl StorageOptionsAccessor { } } - /// Get the configured Azure storage account name (async, uses merged options with env) + /// Get the configured Azure storage account name (async, uses merged options) #[cfg(feature = "azure")] pub async fn get_azure_storage_account(&self) -> Result> { - let opts = self.get_options_with_azure_env().await?; + let opts = self.get_options().await?.unwrap_or_default(); Ok(Self::find_configured_storage_account(&opts)) } @@ -891,36 +947,39 @@ impl StorageOptionsAccessor { Ok(Self::as_gcs_options(&opts)) } - /// Get merged storage options with GCS env vars applied (async, refreshes if needed) + /// Get merged storage options for GCS (async, refreshes if needed) + /// + /// Note: Call `apply_gcs_env()` once before using this method to ensure + /// environment variables are applied to the accessor. #[cfg(feature = "gcp")] - pub async fn get_options_with_gcs_env(&self) -> Result> { - let mut opts = self.get_options().await?.unwrap_or_default(); - Self::apply_env_overrides(&mut opts); - Self::apply_env_gcs(&mut opts); - Ok(opts) + pub async fn get_gcs_storage_options(&self) -> Result> { + Ok(self.get_options().await?.unwrap_or_default()) } - /// Get merged storage options as GCS-specific options map with env vars applied + /// Get merged storage options as GCS-specific options map + /// + /// Note: Call `apply_gcs_env()` once before using this method to ensure + /// environment variables are applied to the accessor. 
#[cfg(feature = "gcp")] pub async fn get_gcs_options_with_env(&self) -> Result> { - let opts = self.get_options_with_gcs_env().await?; + let opts = self.get_options().await?.unwrap_or_default(); Ok(Self::as_gcs_options(&opts)) } /// Check if use_opendal flag is set for GCS #[cfg(feature = "gcp")] pub async fn gcs_use_opendal(&self) -> Result { - let opts = self.get_options_with_gcs_env().await?; + let opts = self.get_options().await?.unwrap_or_default(); Ok(opts .get("use_opendal") .map(|v| v == "true") .unwrap_or(false)) } - /// Get Google storage token (async, uses merged options with env) + /// Get Google storage token (async, uses merged options) #[cfg(feature = "gcp")] pub async fn google_storage_token(&self) -> Result> { - let opts = self.get_options_with_gcs_env().await?; + let opts = self.get_options().await?.unwrap_or_default(); Ok(opts.get("google_storage_token").cloned()) } diff --git a/rust/lance-io/src/scheduler.rs b/rust/lance-io/src/scheduler.rs index a3591940cef..28efd66183b 100644 --- a/rust/lance-io/src/scheduler.rs +++ b/rust/lance-io/src/scheduler.rs @@ -964,7 +964,10 @@ impl FileScheduler { #[cfg(test)] mod tests { - use std::{collections::VecDeque, time::Duration}; + use std::{ + collections::{HashMap, VecDeque}, + time::Duration, + }; use futures::poll; use lance_core::utils::tempfile::TempObjFile; @@ -975,7 +978,9 @@ mod tests { use url::Url; use crate::{ - object_store::{DEFAULT_DOWNLOAD_RETRY_COUNT, DEFAULT_MAX_IOP_SIZE}, + object_store::{ + StorageOptionsAccessor, DEFAULT_DOWNLOAD_RETRY_COUNT, DEFAULT_MAX_IOP_SIZE, + }, testing::MockObjectStore, }; @@ -1136,6 +1141,7 @@ mod tests { } .boxed() }); + let accessor = StorageOptionsAccessor::new_with_options(HashMap::new()); let obj_store = Arc::new(ObjectStore::new( Arc::new(obj_store), Url::parse("mem://").unwrap(), @@ -1145,7 +1151,7 @@ mod tests { false, 1, DEFAULT_DOWNLOAD_RETRY_COUNT, - None, + &accessor, )); let config = SchedulerConfig { @@ -1226,6 +1232,7 @@ mod tests { let base_store = base_store.clone(); async move { base_store.get_opts(&location, options).await }.boxed() }); + let accessor = StorageOptionsAccessor::new_with_options(HashMap::new()); let obj_store = Arc::new(ObjectStore::new( Arc::new(obj_store), Url::parse("mem://").unwrap(), @@ -1235,7 +1242,7 @@ mod tests { false, 1, DEFAULT_DOWNLOAD_RETRY_COUNT, - None, + &accessor, )); let config = SchedulerConfig { diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index d2c01a13a0c..1005c1284b3 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -937,19 +937,7 @@ impl Dataset { // assumes no dataset exists and converts the mode to CREATE. 
diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs
index d2c01a13a0c..1005c1284b3 100644
--- a/rust/lance/src/dataset.rs
+++ b/rust/lance/src/dataset.rs
@@ -937,19 +937,7 @@
         // assumes no dataset exists and converts the mode to CREATE.
         let mut builder = DatasetBuilder::from_uri(uri.as_str());
         if let Some(ref store_params) = write_params.store_params {
-            // Copy storage_options_accessor from store_params if present
-            // The accessor already contains both the initial options and provider
-            if let Some(ref accessor) = store_params.storage_options_accessor {
-                // Get initial storage options from the accessor
-                let initial_options = accessor.initial_options();
-                if let Some(opts) = initial_options {
-                    builder = builder.with_storage_options(opts.clone());
-                }
-                // If the accessor has a provider, set that too
-                if let Some(provider) = accessor.provider() {
-                    builder = builder.with_storage_options_provider(provider);
-                }
-            }
+            builder = builder.with_store_params(store_params.clone());
         }
         let dataset = Arc::new(builder.load().await?);
@@ -1609,24 +1597,6 @@
         &self.object_store
     }
 
-    /// Returns the storage options used when opening this dataset, if any.
-    /// Returns the initial storage options (without provider options).
-    pub fn storage_options(&self) -> Option<&HashMap<String, String>> {
-        self.store_params
-            .as_ref()
-            .and_then(|p| p.storage_options_accessor.as_ref())
-            .and_then(|a| a.initial_options())
-    }
-
-    /// Returns the storage options accessor used when opening this dataset, if any.
-    pub fn storage_options_accessor(
-        &self,
-    ) -> Option<Arc<StorageOptionsAccessor>> {
-        self.store_params
-            .as_ref()
-            .and_then(|p| p.storage_options_accessor.clone())
-    }
-
     /// Returns the current storage options, combining the initial storage_options
     /// with any overrides from the storage_options_provider (cached until expiration).
     ///
@@ -1636,7 +1606,7 @@
     ///
     /// If neither storage_options nor storage_options_provider were specified when
     /// opening the dataset, an empty HashMap is returned.
-    pub async fn current_storage_options(&self) -> Result<HashMap<String, String>> {
+    pub async fn storage_options(&self) -> Result<HashMap<String, String>> {
         if let Some(accessor) = self.storage_options_accessor() {
             Ok(accessor.get_options().await?.unwrap_or_default())
         } else {
@@ -1644,6 +1614,15 @@
         }
     }
 
+    /// Returns the storage options accessor used when opening this dataset, if any.
+    pub fn storage_options_accessor(
+        &self,
+    ) -> Option<Arc<StorageOptionsAccessor>> {
+        self.store_params
+            .as_ref()
+            .and_then(|p| p.storage_options_accessor.clone())
+    }
+
     pub fn data_dir(&self) -> Path {
         self.base.child(DATA_DIR)
     }
diff --git a/rust/lance/src/dataset/builder.rs b/rust/lance/src/dataset/builder.rs
index 1d47f1db56e..0ffca415fb6 100644
--- a/rust/lance/src/dataset/builder.rs
+++ b/rust/lance/src/dataset/builder.rs
@@ -514,7 +514,7 @@ impl DatasetBuilder {
                 // cloud-like
                 DEFAULT_CLOUD_IO_PARALLELISM,
                 download_retry_count,
-                None, // No storage_options available here
+                &accessor,
             )),
             Path::from(store.1.path()),
             commit_handler,
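Taken together, the dataset.rs and builder.rs hunks simplify the caller-facing surface. A sketch of the resulting usage (the URI and the `store_params` value are illustrative):

    // Opening: the whole ObjectStoreParams, accessor included, is forwarded in one call.
    let dataset = DatasetBuilder::from_uri("s3://bucket/table.lance")
        .with_store_params(store_params.clone())
        .load()
        .await?;

    // Reading options back: `storage_options()` is now async and returns the
    // merged initial + provider view; the old sync initial-options getter is gone.
    let opts = dataset.storage_options().await?;

    // The accessor itself remains reachable when refresh tracking matters.
    if let Some(accessor) = dataset.storage_options_accessor() {
        let (_opts, version) = accessor.get_options_with_version().await?;
    }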