diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 741539891597..b5b1ae1a32df 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -21,7 +21,7 @@ version = "0.3.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" -description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage and Azure Blob Storage" +description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files." keywords = [ "object", "storage", @@ -77,4 +77,4 @@ aws = ["rusoto_core", "rusoto_credential", "rusoto_s3", "rusoto_sts", "hyper", " [dev-dependencies] # In alphabetical order dotenv = "0.15.0" tempfile = "3.1.0" -futures-test = "0.3" +futures-test = "0.3" \ No newline at end of file diff --git a/object_store/README.md b/object_store/README.md index 313588b4a73b..fd10414a9285 100644 --- a/object_store/README.md +++ b/object_store/README.md @@ -19,8 +19,21 @@ # Rust Object Store -A crate providing a generic interface to object stores, such as S3, Azure Blob Storage and Google Cloud Storage. +A focused, easy to use, idiomatic, high performance, `async` object +store library interacting with object stores. -Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out and donated to Apache Arrow. +Using this crate, the same binary and code can easily run in multiple +clouds and local test environments, via a simple runtime configuration +change. Supported object stores include: + +* [AWS S3](https://aws.amazon.com/s3/) +* [Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/) +* [Google Cloud Storage](https://cloud.google.com/storage) +* Local files +* Memory +* Custom implementations + + +Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out and donated to [Apache Arrow](https://arrow.apache.org/). 
See [docs.rs](https://docs.rs/object_store) for usage instructions diff --git a/object_store/src/aws.rs b/object_store/src/aws.rs index cedd4651e540..d59f48bcefe0 100644 --- a/object_store/src/aws.rs +++ b/object_store/src/aws.rs @@ -260,7 +260,7 @@ impl From for super::Error { } } -/// Configuration for connecting to [Amazon S3](https://aws.amazon.com/s3/). +/// Interface for [Amazon S3](https://aws.amazon.com/s3/). pub struct AmazonS3 { /// S3 client w/o any connection limit. /// @@ -599,7 +599,8 @@ fn convert_object_meta(object: rusoto_s3::Object, bucket: &str) -> Result for super::Error { } } -/// Configuration for connecting to [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/). +/// Interface for [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/). #[derive(Debug)] pub struct MicrosoftAzure { container_client: Arc, @@ -587,7 +587,8 @@ fn url_from_env(env_name: &str, default_url: &str) -> Result { /// # let ACCOUNT = "foo"; /// # let BUCKET_NAME = "foo"; /// # let ACCESS_KEY = "foo"; -/// let azure = object_store::azure::MicrosoftAzureBuilder::new() +/// # use object_store::azure::MicrosoftAzureBuilder; +/// let azure = MicrosoftAzureBuilder::new() /// .with_account(ACCOUNT) /// .with_access_key(ACCESS_KEY) /// .with_container_name(BUCKET_NAME) diff --git a/object_store/src/gcp.rs b/object_store/src/gcp.rs index dea8769a736b..dd9c84498c1a 100644 --- a/object_store/src/gcp.rs +++ b/object_store/src/gcp.rs @@ -192,7 +192,7 @@ struct CompleteMultipartUpload { parts: Vec, } -/// Configuration for connecting to [Google Cloud Storage](https://cloud.google.com/storage/). +/// Interface for [Google Cloud Storage](https://cloud.google.com/storage/). 
#[derive(Debug)] pub struct GoogleCloudStorage { client: Arc, } @@ -792,7 +792,8 @@ fn reader_credentials_file( /// ``` /// # let BUCKET_NAME = "foo"; /// # let SERVICE_ACCOUNT_PATH = "/tmp/foo.json"; -/// let gcs = object_store::gcp::GoogleCloudStorageBuilder::new() +/// # use object_store::gcp::GoogleCloudStorageBuilder; +/// let gcs = GoogleCloudStorageBuilder::new() /// .with_service_account_path(SERVICE_ACCOUNT_PATH) /// .with_bucket_name(BUCKET_NAME) /// .build(); diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 33e8452d064d..c1d7e3ebd964 100644 --- a/object_store/src/lib.rs +++ b/object_store/src/lib.rs @@ -28,15 +28,129 @@ //! # object_store //! -//! This crate provides APIs for interacting with object storage services. +//! This crate provides a uniform API for interacting with object storage services and +//! local files via the [`ObjectStore`] trait. //! -//! It currently supports PUT (single or chunked/concurrent), GET, DELETE, HEAD and list for: +//! # Create an [`ObjectStore`] implementation: //! -//! * [Google Cloud Storage](https://cloud.google.com/storage/) -//! * [Amazon S3](https://aws.amazon.com/s3/) -//! * [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/#overview) -//! * In-memory -//! * Local file storage +//! * [Google Cloud Storage](https://cloud.google.com/storage/): [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder) +//! * [Amazon S3](https://aws.amazon.com/s3/): [`AmazonS3Builder`](aws::AmazonS3Builder) +//! * [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/): [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder) +//! * In Memory: [`InMemory`](memory::InMemory) +//! * Local filesystem: [`LocalFileSystem`](local::LocalFileSystem) +//! +//! # Adapters +//! +//! [`ObjectStore`] instances can be composed with various adapters +//! which add additional functionality: +//! +//! * Rate Throttling: [`ThrottleConfig`](throttle::ThrottleConfig) +//! 
* Concurrent Request Limit: [`LimitStore`](limit::LimitStore) +//! +//! +//! # Listing objects: +//! +//! Use the [`ObjectStore::list`] method to iterate over objects in +//! remote storage or files in the local filesystem: +//! +//! ``` +//! # use object_store::local::LocalFileSystem; +//! # // use LocalFileSystem for example +//! # fn get_object_store() -> LocalFileSystem { +//! # LocalFileSystem::new_with_prefix("/tmp").unwrap() +//! # } +//! +//! # async fn example() { +//! use std::sync::Arc; +//! use object_store::{path::Path, ObjectStore}; +//! use futures::stream::StreamExt; +//! +//! // create an ObjectStore +//! let object_store: Arc<dyn ObjectStore> = Arc::new(get_object_store()); +//! +//! // Recursively list all files below the 'data' path. +//! // 1. On AWS S3 this would be the 'data/' prefix +//! // 2. On a local filesystem, this would be the 'data' directory +//! let prefix: Path = "data".try_into().unwrap(); +//! +//! // Get an `async` stream of Metadata objects: +//! let list_stream = object_store +//! .list(Some(&prefix)) +//! .await +//! .expect("Error listing files"); +//! +//! // Print a line about each object based on its metadata +//! // using for_each from `StreamExt` trait. +//! list_stream +//! .for_each(move |meta| { +//! async { +//! let meta = meta.expect("Error listing"); +//! println!("Name: {}, size: {}", meta.location, meta.size); +//! } +//! }) +//! .await; +//! # } +//! ``` +//! +//! Which will print out something like the following: +//! +//! ```text +//! Name: data/file01.parquet, size: 112832 +//! Name: data/file02.parquet, size: 143119 +//! Name: data/child/file03.parquet, size: 100 +//! ... +//! ``` +//! +//! # Fetching objects +//! +//! Use the [`ObjectStore::get`] method to fetch the data bytes +//! from remote storage or files in the local filesystem as a stream. +//! +//! ``` +//! # use object_store::local::LocalFileSystem; +//! # // use LocalFileSystem for example +//! # fn get_object_store() -> LocalFileSystem { +//! 
# LocalFileSystem::new_with_prefix("/tmp").unwrap() +//! # } +//! +//! # async fn example() { +//! use std::sync::Arc; +//! use object_store::{path::Path, ObjectStore}; +//! use futures::stream::StreamExt; +//! +//! // create an ObjectStore +//! let object_store: Arc<dyn ObjectStore> = Arc::new(get_object_store()); +//! +//! // Retrieve a specific file +//! let path: Path = "data/file01.parquet".try_into().unwrap(); +//! +//! // fetch the bytes from object store +//! let stream = object_store +//! .get(&path) +//! .await +//! .unwrap() +//! .into_stream(); +//! +//! // Count the '0's using `map` from `StreamExt` trait +//! let num_zeros = stream +//! .map(|bytes| { +//! let bytes = bytes.unwrap(); +//! bytes.iter().filter(|b| **b == 0).count() +//! }) +//! .collect::<Vec<usize>>() +//! .await +//! .into_iter() +//! .sum::<usize>(); +//! +//! println!("Num zeros in {} is {}", path, num_zeros); +//! # } +//! ``` +//! +//! Which will print out something like the following: +//! +//! ```text +//! Num zeros in data/file01.parquet is 657 +//! ``` //! #[cfg(feature = "aws")]