diff --git a/arrow-array/src/array/binary_array.rs b/arrow-array/src/array/binary_array.rs index 3b13a513f646..a4d64040ceff 100644 --- a/arrow-array/src/array/binary_array.rs +++ b/arrow-array/src/array/binary_array.rs @@ -23,8 +23,7 @@ use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::DataType; -/// See [`BinaryArray`] and [`LargeBinaryArray`] for storing -/// binary data. +/// See [`BinaryArray`] and [`LargeBinaryArray`] for storing binary data pub type GenericBinaryArray = GenericByteArray>; impl GenericBinaryArray { @@ -218,7 +217,8 @@ where } } -/// An array where each element contains 0 or more bytes. +/// An array of `[u8]` using `i32` offsets +/// /// The byte length of each element is represented by an i32. /// /// # Examples @@ -258,8 +258,7 @@ where /// pub type BinaryArray = GenericBinaryArray; -/// An array where each element contains 0 or more bytes. -/// The byte length of each element is represented by an i64. +/// An array of `[u8]` using `i64` offsets /// /// # Examples /// diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index d03f0fd040f2..9ecdb2c5d24d 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -25,7 +25,7 @@ use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; -/// Array of bools +/// An array of [boolean values](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) /// /// # Example /// diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 12f9aab674e8..629ffd22cdc2 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -28,7 +28,7 @@ use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; -/// Generic struct for variable-size byte arrays +/// An array of [variable length byte 
arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout) /// /// See [`StringArray`] and [`LargeStringArray`] for storing utf8 encoded string data /// diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index 75fd4c6d0d68..a319a836a955 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -30,8 +30,7 @@ use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; -/// -/// A dictionary array where each element is a single value indexed by an integer key. +/// A dictionary array indexed by `i8` /// /// # Example: Using `collect` /// ``` @@ -44,8 +43,8 @@ use std::sync::Arc; /// assert_eq!(array.values(), &values); /// ``` pub type Int8DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. + +/// A dictionary array indexed by `i16` /// /// # Example: Using `collect` /// ``` @@ -58,8 +57,8 @@ pub type Int8DictionaryArray = DictionaryArray; /// assert_eq!(array.values(), &values); /// ``` pub type Int16DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. + +/// A dictionary array indexed by `i32` /// /// # Example: Using `collect` /// ``` @@ -72,8 +71,8 @@ pub type Int16DictionaryArray = DictionaryArray; /// assert_eq!(array.values(), &values); /// ``` pub type Int32DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. + +/// A dictionary array indexed by `i64` /// /// # Example: Using `collect` /// ``` @@ -86,8 +85,8 @@ pub type Int32DictionaryArray = DictionaryArray; /// assert_eq!(array.values(), &values); /// ``` pub type Int64DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. 
+ +/// A dictionary array indexed by `u8` /// /// # Example: Using `collect` /// ``` @@ -100,8 +99,8 @@ pub type Int64DictionaryArray = DictionaryArray; /// assert_eq!(array.values(), &values); /// ``` pub type UInt8DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. + +/// A dictionary array indexed by `u16` /// /// # Example: Using `collect` /// ``` @@ -114,8 +113,8 @@ pub type UInt8DictionaryArray = DictionaryArray; /// assert_eq!(array.values(), &values); /// ``` pub type UInt16DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. + +/// A dictionary array indexed by `u32` /// /// # Example: Using `collect` /// ``` @@ -128,8 +127,8 @@ pub type UInt16DictionaryArray = DictionaryArray; /// assert_eq!(array.values(), &values); /// ``` pub type UInt32DictionaryArray = DictionaryArray; -/// -/// A dictionary array where each element is a single value indexed by an integer key. + +/// A dictionary array indexed by `u64` /// /// # Example: Using `collect` /// ``` @@ -143,7 +142,8 @@ pub type UInt32DictionaryArray = DictionaryArray; /// ``` pub type UInt64DictionaryArray = DictionaryArray; -/// A dictionary array where each element is a single value indexed by an integer key. +/// An array of [dictionary encoded values](https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout) +/// /// This is mostly used to represent strings or a limited set of primitive types as integers, /// for example when doing NLP analysis or representing chromosomes by name. 
/// @@ -695,8 +695,9 @@ impl std::fmt::Debug for DictionaryArray { } } -/// A strongly-typed wrapper around a [`DictionaryArray`] that implements [`ArrayAccessor`] -/// allowing fast access to its elements +/// A [`DictionaryArray`] typed on its child values array +/// +/// Implements [`ArrayAccessor`] allowing fast access to its elements /// /// ``` /// use arrow_array::{DictionaryArray, StringArray, types::Int32Type}; diff --git a/arrow-array/src/array/fixed_size_binary_array.rs b/arrow-array/src/array/fixed_size_binary_array.rs index 08ce76c066c3..083d71cd963f 100644 --- a/arrow-array/src/array/fixed_size_binary_array.rs +++ b/arrow-array/src/array/fixed_size_binary_array.rs @@ -25,7 +25,7 @@ use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; -/// An array where each element is a fixed-size sequence of bytes. +/// An array of [fixed size binary arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) /// /// # Examples /// diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 86adafa066f0..18fa9df928ff 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -24,8 +24,7 @@ use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; -/// A list array where each element is a fixed-size sequence of values with the same -/// type whose maximum length is represented by a i32. 
+/// An array of [fixed size arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-list-layout) /// /// # Example /// @@ -59,9 +58,6 @@ use std::sync::Arc; /// assert_eq!( &[3, 4, 5], list1.as_any().downcast_ref::().unwrap().values()); /// assert_eq!( &[6, 7, 8], list2.as_any().downcast_ref::().unwrap().values()); /// ``` -/// -/// For non generic lists, you may wish to consider using -/// [crate::array::FixedSizeBinaryArray] #[derive(Clone)] pub struct FixedSizeListArray { data_type: DataType, // Must be DataType::FixedSizeList(value_length) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index f4e5b4b79c77..db587f405b88 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -28,7 +28,7 @@ use num::Integer; use std::any::Any; use std::sync::Arc; -/// trait declaring an offset size, relevant for i32 vs i64 array types. +/// A type that can be used within a variable-size array to encode offset information pub trait OffsetSizeTrait: ArrowNativeType + std::ops::AddAssign + Integer { /// True for 64 bit offset size and false for 32 bit offset size const IS_LARGE: bool; @@ -46,12 +46,9 @@ impl OffsetSizeTrait for i64 { const PREFIX: &'static str = "Large"; } -/// Generic struct for a variable-size list array. +/// An array of [variable length arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout) /// -/// Columnar format in Apache Arrow: -/// -/// -/// For non generic lists, you may wish to consider using [`ListArray`] or [`LargeListArray`]` +/// See [`ListArray`] and [`LargeListArray`] pub struct GenericListArray { data_type: DataType, nulls: Option, @@ -447,8 +444,7 @@ impl std::fmt::Debug for GenericListArray std::fmt::Debug for GenericListArray; -/// A list array where each element is a variable-sized sequence of values with the same -/// type whose memory offsets between elements are represented by a i64.
+/// An array of variable size arrays, storing offsets as `i64`. +/// /// # Example /// /// ``` diff --git a/arrow-array/src/array/map_array.rs b/arrow-array/src/array/map_array.rs index c53e452a67dd..cf0978f05b4e 100644 --- a/arrow-array/src/array/map_array.rs +++ b/arrow-array/src/array/map_array.rs @@ -23,7 +23,8 @@ use arrow_schema::{ArrowError, DataType, Field}; use std::any::Any; use std::sync::Arc; -/// A nested array type where each record is a key-value map. +/// An array of key-value maps +/// /// Keys should always be non-null, but values can be null. /// /// [MapArray] is physically a [crate::array::ListArray] that has a diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index e6fd6828bac7..9312770644a3 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -67,8 +67,7 @@ pub use union_array::*; mod run_array; pub use run_array::*; -/// Trait for dealing with different types of array at runtime when the type of the -/// array is not known in advance. +/// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html) pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the array as [`Any`](std::any::Any) so that it can be /// downcasted to a specific implementation. @@ -237,7 +236,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { fn get_array_memory_size(&self) -> usize; } -/// A reference-counted reference to a generic `Array`. 
+/// A reference-counted reference to a generic `Array` pub type ArrayRef = Arc; /// Ergonomics: Allow use of an ArrayRef as an `&dyn Array` diff --git a/arrow-array/src/array/null_array.rs b/arrow-array/src/array/null_array.rs index c7f61d91da70..7fdd99a39675 100644 --- a/arrow-array/src/array/null_array.rs +++ b/arrow-array/src/array/null_array.rs @@ -24,7 +24,7 @@ use arrow_schema::DataType; use std::any::Any; use std::sync::Arc; -/// An Array where all elements are nulls +/// An array of [null values](https://arrow.apache.org/docs/format/Columnar.html#null-layout) /// /// A `NullArray` is a simplified array where all values are null. /// diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 8c8562b5be38..8c3fab237a55 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -34,6 +34,7 @@ use half::f16; use std::any::Any; use std::sync::Arc; +/// An array of `i8` /// /// # Example: Using `collect` /// ``` @@ -41,6 +42,8 @@ use std::sync::Arc; /// let arr : Int8Array = [Some(1), Some(2)].into_iter().collect(); /// ``` pub type Int8Array = PrimitiveArray; + +/// An array of `i16` /// /// # Example: Using `collect` /// ``` @@ -48,6 +51,8 @@ pub type Int8Array = PrimitiveArray; /// let arr : Int16Array = [Some(1), Some(2)].into_iter().collect(); /// ``` pub type Int16Array = PrimitiveArray; + +/// An array of `i32` /// /// # Example: Using `collect` /// ``` @@ -55,6 +60,8 @@ pub type Int16Array = PrimitiveArray; /// let arr : Int32Array = [Some(1), Some(2)].into_iter().collect(); /// ``` pub type Int32Array = PrimitiveArray; + +/// An array of `i64` /// /// # Example: Using `collect` /// ``` @@ -62,13 +69,16 @@ pub type Int32Array = PrimitiveArray; /// let arr : Int64Array = [Some(1), Some(2)].into_iter().collect(); /// ``` pub type Int64Array = PrimitiveArray; -/// + +/// An array of `u8` /// # Example: Using `collect` /// ``` /// # use arrow_array::UInt8Array; /// let 
arr : UInt8Array = [Some(1), Some(2)].into_iter().collect(); /// ``` pub type UInt8Array = PrimitiveArray; + +/// An array of `u16` /// /// # Example: Using `collect` /// ``` @@ -76,6 +86,8 @@ pub type UInt8Array = PrimitiveArray; /// let arr : UInt16Array = [Some(1), Some(2)].into_iter().collect(); /// ``` pub type UInt16Array = PrimitiveArray; + +/// An array of `u32` /// /// # Example: Using `collect` /// ``` @@ -83,6 +95,8 @@ pub type UInt16Array = PrimitiveArray; /// let arr : UInt32Array = [Some(1), Some(2)].into_iter().collect(); /// ``` pub type UInt32Array = PrimitiveArray; + +/// An array of `u64` /// /// # Example: Using `collect` /// ``` @@ -90,6 +104,8 @@ pub type UInt32Array = PrimitiveArray; /// let arr : UInt64Array = [Some(1), Some(2)].into_iter().collect(); /// ``` pub type UInt64Array = PrimitiveArray; + +/// An array of `f16` /// /// # Example: Using `collect` /// ``` @@ -98,6 +114,8 @@ pub type UInt64Array = PrimitiveArray; /// let arr : Float16Array = [Some(f16::from_f64(1.0)), Some(f16::from_f64(2.0))].into_iter().collect(); /// ``` pub type Float16Array = PrimitiveArray; + +/// An array of `f32` /// /// # Example: Using `collect` /// ``` @@ -105,6 +123,8 @@ pub type Float16Array = PrimitiveArray; /// let arr : Float32Array = [Some(1.0), Some(2.0)].into_iter().collect(); /// ``` pub type Float32Array = PrimitiveArray; + +/// An array of `f64` /// /// # Example: Using `collect` /// ``` @@ -113,8 +133,12 @@ pub type Float32Array = PrimitiveArray; /// ``` pub type Float64Array = PrimitiveArray; +/// An array of 64-bit values representing the elapsed time +/// since UNIX epoch in seconds +/// +/// This type is similar to the [`chrono::DateTime`] type and can hold +/// values such as `1970-05-09 14:25:11 +01:00` /// -/// A primitive array where each element is of type [TimestampSecondType]. /// See also [`Timestamp`](arrow_schema::DataType::Timestamp). 
/// /// # Example: UTC timestamps post epoch @@ -157,82 +181,99 @@ pub type Float64Array = PrimitiveArray; /// ``` /// pub type TimestampSecondArray = PrimitiveArray; -/// A primitive array where each element is of type `TimestampMillisecondType.` -/// See examples for [`TimestampSecondArray.`](crate::array::TimestampSecondArray) + +/// An array of 64-bit values representing the elapsed time +/// since UNIX epoch in milliseconds +/// +/// See examples for [`TimestampSecondArray`] pub type TimestampMillisecondArray = PrimitiveArray; -/// A primitive array where each element is of type `TimestampMicrosecondType.` -/// See examples for [`TimestampSecondArray.`](crate::array::TimestampSecondArray) + +/// An array of 64-bit values representing the elapsed time +/// since UNIX epoch in microseconds +/// +/// See examples for [`TimestampSecondArray`] pub type TimestampMicrosecondArray = PrimitiveArray; -/// A primitive array where each element is of type `TimestampNanosecondType.` -/// See examples for [`TimestampSecondArray.`](crate::array::TimestampSecondArray) + +/// An array of 64-bit values representing the elapsed time +/// since UNIX epoch in nanoseconds +/// +/// See examples for [`TimestampSecondArray`] pub type TimestampNanosecondArray = PrimitiveArray; // TODO: give examples for the below types -/// A primitive array where each element is of 32-bit value -/// representing the elapsed time since UNIX epoch in days." +/// An array of 32-bit values representing the elapsed time +/// since UNIX epoch in days /// /// This type is similar to the [`chrono::NaiveDate`] type and can hold /// values such as `2018-11-13` pub type Date32Array = PrimitiveArray; -/// A primitive array where each element is a 64-bit value -/// representing the elapsed time since the UNIX epoch in milliseconds. 
+ +/// An array of 64-bit values representing the elapsed time +/// since UNIX epoch in milliseconds /// -/// This type is similar to the [`chrono::NaiveDateTime`] type and can hold -/// values such as `2018-11-13T17:11:10.011` +/// This type is similar to the [`chrono::NaiveDate`] type and can hold +/// values such as `2018-11-13` pub type Date64Array = PrimitiveArray; -/// An array where each element is of 32-bit type representing time elapsed in seconds -/// since midnight. +/// An array of 32-bit values representing the elapsed time +/// since midnight in seconds /// /// This type is similar to the [`chrono::NaiveTime`] type and can /// hold values such as `00:02:00` pub type Time32SecondArray = PrimitiveArray; -/// An array where each element is of 32-bit type representing time elapsed in milliseconds -/// since midnight. + +/// An array of 32-bit values representing the elapsed time +/// since midnight in milliseconds /// /// This type is similar to the [`chrono::NaiveTime`] type and can /// hold values such as `00:02:00.123` pub type Time32MillisecondArray = PrimitiveArray; -/// An array where each element is of 64-bit type representing time elapsed in microseconds -/// since midnight. + +/// An array of 64-bit values representing the elapsed time +/// since midnight in microseconds /// /// This type is similar to the [`chrono::NaiveTime`] type and can /// hold values such as `00:02:00.123456` pub type Time64MicrosecondArray = PrimitiveArray; -/// An array where each element is of 64-bit type representing time elapsed in nanoseconds -/// since midnight. + +/// An array of 64-bit values representing the elapsed time +/// since midnight in nanoseconds /// /// This type is similar to the [`chrono::NaiveTime`] type and can /// hold values such as `00:02:00.123456789` pub type Time64NanosecondArray = PrimitiveArray; -/// An array where each element is a “calendar” interval in months. 
+/// An array of “calendar” intervals in months pub type IntervalYearMonthArray = PrimitiveArray; -/// An array where each element is a “calendar” interval days and milliseconds. + +/// An array of “calendar” intervals in days and milliseconds pub type IntervalDayTimeArray = PrimitiveArray; -/// An array where each element is a “calendar” interval in months, days, and nanoseconds. + +/// An array of “calendar” intervals in months, days, and nanoseconds pub type IntervalMonthDayNanoArray = PrimitiveArray; -/// An array where each element is an elapsed time type in seconds. +/// An array of elapsed durations in seconds pub type DurationSecondArray = PrimitiveArray; -/// An array where each element is an elapsed time type in milliseconds. + +/// An array of elapsed durations in milliseconds pub type DurationMillisecondArray = PrimitiveArray; -/// An array where each element is an elapsed time type in microseconds. + +/// An array of elapsed durations in microseconds pub type DurationMicrosecondArray = PrimitiveArray; -/// An array where each element is an elapsed time type in nanoseconds. + +/// An array of elapsed durations in nanoseconds pub type DurationNanosecondArray = PrimitiveArray; -/// An array where each element is a 128-bits decimal with precision in [1, 38] and -/// scale less or equal to 38. +/// An array of 128-bit fixed point decimals pub type Decimal128Array = PrimitiveArray; -/// An array where each element is a 256-bits decimal with precision in [1, 76] and -/// scale less or equal to 76. + +/// An array of 256-bit fixed point decimals pub type Decimal256Array = PrimitiveArray; pub use crate::types::ArrowPrimitiveType; -/// Array whose elements are of primitive types. 
+/// An array of [primitive values](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) /// /// # Example: From an iterator of values /// @@ -890,6 +931,8 @@ impl<'a, T: ArrowPrimitiveType> PrimitiveArray { } } +/// An optional primitive value +/// /// This struct is used as an adapter when creating `PrimitiveArray` from an iterator. /// `FromIterator` for `PrimitiveArray` takes an iterator where the elements can be `into` /// this struct. So once implementing `From` or `Into` trait for a type, an iterator of diff --git a/arrow-array/src/array/run_array.rs b/arrow-array/src/array/run_array.rs index e7e71d3840bb..820d5c9ebfc1 100644 --- a/arrow-array/src/array/run_array.rs +++ b/arrow-array/src/array/run_array.rs @@ -30,10 +30,10 @@ use crate::{ Array, ArrayAccessor, ArrayRef, PrimitiveArray, }; +/// An array of [run-end encoded values](https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout) /// -/// A run-end encoding (REE) is a variation of [run-length encoding (RLE)](https://en.wikipedia.org/wiki/Run-length_encoding). -/// -/// This encoding is good for representing data containing same values repeated consecutively. +/// This encoding is a variation on [run-length encoding (RLE)](https://en.wikipedia.org/wiki/Run-length_encoding) +/// and is good for representing data containing the same values repeated consecutively. /// /// [`RunArray`] contains `run_ends` array and `values` array of same length. /// The `run_ends` array stores the indexes at which the run ends. The `values` array @@ -428,7 +428,7 @@ impl<'a, T: RunEndIndexType> FromIterator<&'a str> for RunArray { } /// -/// A [`RunArray`] array where run ends are stored using `i16` data type.
+/// A [`RunArray`] with `i16` run ends /// /// # Example: Using `collect` /// ``` @@ -443,7 +443,7 @@ impl<'a, T: RunEndIndexType> FromIterator<&'a str> for RunArray { pub type Int16RunArray = RunArray; /// -/// A [`RunArray`] array where run ends are stored using `i32` data type. +/// A [`RunArray`] with `i32` run ends /// /// # Example: Using `collect` /// ``` @@ -458,7 +458,7 @@ pub type Int16RunArray = RunArray; pub type Int32RunArray = RunArray; /// -/// A [`RunArray`] array where run ends are stored using `i64` data type. +/// A [`RunArray`] with `i64` run ends /// /// # Example: Using `collect` /// ``` @@ -472,8 +472,9 @@ pub type Int32RunArray = RunArray; /// ``` pub type Int64RunArray = RunArray; -/// A strongly-typed wrapper around a [`RunArray`] that implements [`ArrayAccessor`] -/// and [`IntoIterator`] allowing fast access to its elements +/// A [`RunArray`] typed on its child values array +/// +/// Implements [`ArrayAccessor`] and [`IntoIterator`] allowing fast access to its elements /// /// ``` /// use arrow_array::{RunArray, StringArray, types::Int32Type}; diff --git a/arrow-array/src/array/string_array.rs b/arrow-array/src/array/string_array.rs index 7c4a375299db..d8f1c5da16c7 100644 --- a/arrow-array/src/array/string_array.rs +++ b/arrow-array/src/array/string_array.rs @@ -21,10 +21,7 @@ use arrow_buffer::{bit_util, MutableBuffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; -/// Generic struct for \[Large\]StringArray -/// -/// See [`StringArray`] and [`LargeStringArray`] for storing -/// specific string data. +/// See [`StringArray`] and [`LargeStringArray`] for storing string data pub type GenericStringArray = GenericByteArray>; impl GenericStringArray { @@ -211,8 +208,7 @@ impl From> for GenericStringArray From> for GenericStringArray; -/// An array where each element is a variable-sized sequence of bytes representing a string -/// whose maximum length (in bytes) is represented by a i64.
+/// An array of `str` using `i64` offsets /// /// Example /// diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index fac947f14bfd..78220a518741 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -22,9 +22,9 @@ use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, SchemaBuilder} use std::sync::Arc; use std::{any::Any, ops::Index}; -/// A nested array type where each child (called *field*) is represented by a separate -/// array. +/// An array of [tuples](https://arrow.apache.org/docs/format/Columnar.html#struct-layout) /// +/// Each child (called *field*) is represented by a separate array. /// /// # Comparison with [RecordBatch] /// diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 172ae082197c..74a5f1efa767 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -25,7 +25,7 @@ use arrow_schema::{ArrowError, DataType, Field, UnionFields, UnionMode}; use std::any::Any; use std::sync::Arc; -/// An Array that can represent slots of varying types. +/// An array of [values of varying types](https://arrow.apache.org/docs/format/Columnar.html#union-layout) /// /// Each slot in a [UnionArray] can have a value chosen from a number /// of types. 
Each of the possible types are named like the fields of diff --git a/arrow-array/src/builder/boolean_buffer_builder.rs b/arrow-array/src/builder/boolean_buffer_builder.rs index f721504d08aa..1a3473e19a04 100644 --- a/arrow-array/src/builder/boolean_buffer_builder.rs +++ b/arrow-array/src/builder/boolean_buffer_builder.rs @@ -19,7 +19,7 @@ use arrow_buffer::{bit_util, BooleanBuffer, Buffer, MutableBuffer}; use arrow_data::bit_mask; use std::ops::Range; -/// A builder for creating a boolean [`Buffer`] +/// Builder for [`BooleanBuffer`] #[derive(Debug)] pub struct BooleanBufferBuilder { buffer: MutableBuffer, diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index c7974967a700..a35e6f6b97e5 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -24,7 +24,7 @@ use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; -/// Array builder for fixed-width primitive types +/// Builder for [`BooleanArray`] /// /// # Example /// diff --git a/arrow-array/src/builder/fixed_size_binary_builder.rs b/arrow-array/src/builder/fixed_size_binary_builder.rs index 695b553f0eee..a354a1db24e1 100644 --- a/arrow-array/src/builder/fixed_size_binary_builder.rs +++ b/arrow-array/src/builder/fixed_size_binary_builder.rs @@ -24,11 +24,11 @@ use arrow_schema::{ArrowError, DataType}; use std::any::Any; use std::sync::Arc; -/// A fixed size binary array builder +/// Builder for [`FixedSizeBinaryArray`] /// ``` -/// use arrow_array::builder::FixedSizeBinaryBuilder; -/// use arrow_array::Array; -/// +/// # use arrow_array::builder::FixedSizeBinaryBuilder; +/// # use arrow_array::Array; +/// # /// let mut builder = FixedSizeBinaryBuilder::with_capacity(3, 5); /// // [b"hello", null, b"arrow"] /// builder.append_value(b"hello").unwrap(); diff --git a/arrow-array/src/builder/fixed_size_list_builder.rs b/arrow-array/src/builder/fixed_size_list_builder.rs index 
57af768447c8..ab9fbf5fa63f 100644 --- a/arrow-array/src/builder/fixed_size_list_builder.rs +++ b/arrow-array/src/builder/fixed_size_list_builder.rs @@ -24,7 +24,7 @@ use arrow_schema::{DataType, Field}; use std::any::Any; use std::sync::Arc; -/// Array builder for [`FixedSizeListArray`] +/// Builder for [`FixedSizeListArray`] /// ``` /// use arrow_array::{builder::{Int32Builder, FixedSizeListBuilder}, Array, Int32Array}; /// let values_builder = Int32Builder::new(); diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs b/arrow-array/src/builder/generic_byte_run_builder.rs index 9c26d7be6904..97082fe96673 100644 --- a/arrow-array/src/builder/generic_byte_run_builder.rs +++ b/arrow-array/src/builder/generic_byte_run_builder.rs @@ -30,7 +30,7 @@ use super::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder}; use arrow_buffer::ArrowNativeType; -/// Array builder for [`RunArray`] for String and Binary types. +/// Builder for [`RunArray`] of [`GenericByteArray`](crate::array::GenericByteArray) /// /// # Example: /// @@ -309,7 +309,7 @@ where } } -/// Array builder for [`RunArray`] that encodes strings ([`Utf8Type`]). +/// Builder for [`RunArray`] of [`StringArray`](crate::array::StringArray) /// /// ``` /// // Create a run-end encoded array with run-end indexes data type as `i16`. @@ -319,7 +319,7 @@ where /// # use arrow_array::{Int16Array, StringArray}; /// # use arrow_array::types::Int16Type; /// # use arrow_array::cast::AsArray; -/// +/// # /// let mut builder = StringRunBuilder::::new(); /// /// // The builder builds the dictionary value by value @@ -342,10 +342,10 @@ where /// ``` pub type StringRunBuilder = GenericByteRunBuilder; -/// Array builder for [`RunArray`] that encodes large strings ([`LargeUtf8Type`]). See [`StringRunBuilder`] for an example. 
+/// Builder for [`RunArray`] of [`LargeStringArray`](crate::array::LargeStringArray) pub type LargeStringRunBuilder = GenericByteRunBuilder; -/// Array builder for [`RunArray`] that encodes binary values([`BinaryType`]). +/// Builder for [`RunArray`] of [`BinaryArray`](crate::array::BinaryArray) /// /// ``` /// // Create a run-end encoded array with run-end indexes data type as `i16`. @@ -378,8 +378,7 @@ pub type LargeStringRunBuilder = GenericByteRunBuilder; /// ``` pub type BinaryRunBuilder = GenericByteRunBuilder; -/// Array builder for [`RunArray`] that encodes large binary values([`LargeBinaryType`]). -/// See documentation of [`BinaryRunBuilder`] for an example. +/// Builder for [`RunArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray) pub type LargeBinaryRunBuilder = GenericByteRunBuilder; #[cfg(test)] diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index a3598d8bf26d..1887ab36c6d9 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -25,7 +25,7 @@ use std::any::Any; use std::fmt::Write; use std::sync::Arc; -/// Array builder for [`GenericByteArray`] +/// Builder for [`GenericByteArray`] pub struct GenericByteBuilder { value_builder: UInt8BufferBuilder, offsets_builder: BufferBuilder, diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index dd9a70b1d431..d5c62865ff8d 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -27,7 +27,8 @@ use hashbrown::HashMap; use std::any::Any; use std::sync::Arc; -/// Generic array builder for `DictionaryArray` that stores generic byte values. +/// Builder for [`DictionaryArray`] of [`GenericByteArray`] +/// /// For example to map a set of byte indices to String values. 
Note that /// the use of a `HashMap` here will not scale to very large arrays or /// result in an ordered dictionary. @@ -338,9 +339,7 @@ fn get_bytes(values: &GenericByteBuilder, idx: usize) -> &[ &values[start_offset..end_offset] } -/// Array builder for `DictionaryArray` that stores Strings. For example to map a set of byte indices -/// to String values. Note that the use of a `HashMap` here will not scale to very large -/// arrays or result in an ordered dictionary. +/// Builder for [`DictionaryArray`] of [`StringArray`](crate::array::StringArray) /// /// ``` /// // Create a dictionary array indexed by bytes whose values are Strings. @@ -376,15 +375,11 @@ fn get_bytes(values: &GenericByteBuilder, idx: usize) -> &[ pub type StringDictionaryBuilder = GenericByteDictionaryBuilder>; -/// Array builder for `DictionaryArray` that stores large Strings. For example to map a set of byte indices -/// to String values. Note that the use of a `HashMap` here will not scale to very large -/// arrays or result in an ordered dictionary. +/// Builder for [`DictionaryArray`] of [`LargeStringArray`](crate::array::LargeStringArray) pub type LargeStringDictionaryBuilder = GenericByteDictionaryBuilder>; -/// Array builder for `DictionaryArray` that stores binary. For example to map a set of byte indices -/// to binary values. Note that the use of a `HashMap` here will not scale to very large -/// arrays or result in an ordered dictionary. +/// Builder for [`DictionaryArray`] of [`BinaryArray`](crate::array::BinaryArray) /// /// ``` /// // Create a dictionary array indexed by bytes whose values are binary. @@ -420,9 +415,7 @@ pub type LargeStringDictionaryBuilder = pub type BinaryDictionaryBuilder = GenericByteDictionaryBuilder>; -/// Array builder for `DictionaryArray` that stores large binary. For example to map a set of byte indices -/// to binary values. Note that the use of a `HashMap` here will not scale to very large -/// arrays or result in an ordered dictionary. 
+/// Builder for [`DictionaryArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray) pub type LargeBinaryDictionaryBuilder = GenericByteDictionaryBuilder>; diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index b6d0707982be..054c87187fbe 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -24,7 +24,7 @@ use arrow_schema::Field; use std::any::Any; use std::sync::Arc; -/// Array builder for [`GenericListArray`]s. +/// Builder for [`GenericListArray`] /// /// Use [`ListBuilder`] to build [`ListArray`]s and [`LargeListBuilder`] to build [`LargeListArray`]s. /// diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index db85465c8d5c..b73e65b117f1 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -24,7 +24,8 @@ use arrow_schema::{ArrowError, DataType, Field}; use std::any::Any; use std::sync::Arc; -/// Creates a new `MapBuilder` +/// Builder for [`MapArray`] +/// /// ``` /// # use arrow_array::builder::{Int32Builder, MapBuilder, StringBuilder}; /// # use arrow_array::{Int32Array, StringArray}; @@ -62,7 +63,7 @@ pub struct MapBuilder { value_builder: V, } -/// Contains details of the mapping +/// The [`Field`] names for a [`MapArray`] #[derive(Debug, Clone)] pub struct MapFieldNames { /// [`Field`] name for map entries diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 081f4d5f41f6..c4f581fbfb46 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Defines builders that can be used to safely build arrays +//! Defines push-based APIs for constructing arrays //! //! # Basic Usage //! @@ -81,7 +81,9 @@ //! # Custom Builders //! //! 
It is common to have a collection of statically defined Rust types that -//! you want to convert to Arrow arrays. An example of doing so is below +//! you want to convert to Arrow arrays. +//! +//! An example of doing so is below //! //! ``` //! # use std::any::Any; @@ -261,26 +263,20 @@ pub trait ArrayBuilder: Any + Send { fn into_box_any(self: Box) -> Box; } -/// Builder for [`ListArray`]s (i32 offsets) -/// -/// See [`GenericListBuilder`] for usage examples -/// -/// [`ListArray`]: crate::array::ListArray +/// Builder for [`ListArray`](crate::array::ListArray) pub type ListBuilder = GenericListBuilder; -/// Builder for [`LargeListArray`]s (i64 offsets) -/// -/// See [`GenericListBuilder`] for usage examples -/// -/// [`LargeListArray`]: crate::array::LargeListArray +/// Builder for [`LargeListArray`](crate::array::LargeListArray) pub type LargeListBuilder = GenericListBuilder; -/// A binary array builder with i32 offsets +/// Builder for [`BinaryArray`](crate::array::BinaryArray) pub type BinaryBuilder = GenericBinaryBuilder; -/// A binary array builder with i64 offsets + +/// Builder for [`LargeBinaryArray`](crate::array::LargeBinaryArray) pub type LargeBinaryBuilder = GenericBinaryBuilder; -/// A string array builder with i32 offsets +/// Builder for [`StringArray`](crate::array::StringArray) pub type StringBuilder = GenericStringBuilder; -/// A string array builder with i64 offsets + +/// Builder for [`LargeStringArray`](crate::array::LargeStringArray) pub type LargeStringBuilder = GenericStringBuilder; diff --git a/arrow-array/src/builder/primitive_builder.rs b/arrow-array/src/builder/primitive_builder.rs index 6688d07b7055..57d9c48bb170 100644 --- a/arrow-array/src/builder/primitive_builder.rs +++ b/arrow-array/src/builder/primitive_builder.rs @@ -90,7 +90,7 @@ pub type Decimal128Builder = PrimitiveBuilder; /// A decimal 256 array builder pub type Decimal256Builder = PrimitiveBuilder; -/// Array builder for fixed-width primitive types +/// Builder for 
[`PrimitiveArray`] #[derive(Debug)] pub struct PrimitiveBuilder { values_builder: BufferBuilder, diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index 41880d3a478c..cde1abe22b7b 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -45,9 +45,7 @@ impl PartialEq for Value { impl Eq for Value {} -/// Array builder for `DictionaryArray`. For example to map a set of byte indices -/// to f32 values. Note that the use of a `HashMap` here will not scale to very large -/// arrays or result in an ordered dictionary. +/// Builder for [`DictionaryArray`] of [`PrimitiveArray`](crate::array::PrimitiveArray) /// /// # Example: /// diff --git a/arrow-array/src/builder/primitive_run_builder.rs b/arrow-array/src/builder/primitive_run_builder.rs index 30750b6f3421..53674a73b172 100644 --- a/arrow-array/src/builder/primitive_run_builder.rs +++ b/arrow-array/src/builder/primitive_run_builder.rs @@ -23,7 +23,7 @@ use super::{ArrayBuilder, PrimitiveBuilder}; use arrow_buffer::ArrowNativeType; -/// Array builder for [`RunArray`] that encodes primitive values. +/// Builder for [`RunArray`] of [`PrimitiveArray`](crate::array::PrimitiveArray) /// /// # Example: /// diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index ebffeafcf75f..cc66044aa44d 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -24,7 +24,7 @@ use arrow_schema::{DataType, Fields, IntervalUnit, TimeUnit}; use std::any::Any; use std::sync::Arc; -/// Array builder for Struct types. +/// Builder for [`StructArray`] /// /// Note that callers should make sure that methods of all the child field builders are /// properly called to maintain the consistency of the data structure. 
diff --git a/arrow-array/src/builder/union_builder.rs b/arrow-array/src/builder/union_builder.rs index 8ca303da8cb4..6461a56aabbe 100644 --- a/arrow-array/src/builder/union_builder.rs +++ b/arrow-array/src/builder/union_builder.rs @@ -99,7 +99,7 @@ impl FieldData { } } -/// Builder type for creating a new `UnionArray`. +/// Builder for [`UnionArray`] /// /// Example: **Dense Memory Layout** /// diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index ff1ddb1f67ce..551f67cb1468 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -19,42 +19,6 @@ //! all having the same type. This crate provides concrete implementations of each type, as //! well as an [`Array`] trait that can be used for type-erasure. //! -//! # Downcasting an Array -//! -//! Arrays are often passed around as a dynamically typed [`&dyn Array`] or [`ArrayRef`]. -//! For example, [`RecordBatch`](`crate::RecordBatch`) stores columns as [`ArrayRef`]. -//! -//! Whilst these arrays can be passed directly to the [`compute`], [`csv`], [`json`], etc... APIs, -//! it is often the case that you wish to interact with the data directly. -//! -//! This requires downcasting to the concrete type of the array: -//! -//! ``` -//! # use arrow_array::{Array, Float32Array, Int32Array}; -//! -//! fn sum_int32(array: &dyn Array) -> i32 { -//! let integers: &Int32Array = array.as_any().downcast_ref().unwrap(); -//! integers.iter().map(|val| val.unwrap_or_default()).sum() -//! } -//! -//! // Note: the values for positions corresponding to nulls will be arbitrary -//! fn as_f32_slice(array: &dyn Array) -> &[f32] { -//! array.as_any().downcast_ref::().unwrap().values() -//! } -//! ``` -//! -//! The [`cast::AsArray`] extension trait can make this more ergonomic -//! -//! ``` -//! # use arrow_array::Array; -//! # use arrow_array::cast::{AsArray, as_primitive_array}; -//! # use arrow_array::types::Float32Type; -//! -//! fn as_f32_slice(array: &dyn Array) -> &[f32] { -//! 
array.as_primitive::().values() -//! } -//! ``` - //! # Building an Array //! //! Most [`Array`] implementations can be constructed directly from iterators or [`Vec`] @@ -62,7 +26,7 @@ //! ``` //! # use arrow_array::{Int32Array, ListArray, StringArray}; //! # use arrow_array::types::Int32Type; -//! +//! # //! Int32Array::from(vec![1, 2]); //! Int32Array::from(vec![Some(1), None]); //! Int32Array::from_iter([1, 2, 3, 4]); @@ -91,30 +55,59 @@ //! //! // Append a single primitive value //! builder.append_value(1); -//! //! // Append a null value //! builder.append_null(); -//! //! // Append a slice of primitive values //! builder.append_slice(&[2, 3, 4]); //! //! // Build the array //! let array = builder.finish(); //! -//! assert_eq!( -//! 5, -//! array.len(), -//! "The array has 5 values, counting the null value" -//! ); +//! assert_eq!(5, array.len()); +//! assert_eq!(2, array.value(2)); +//! assert_eq!(&array.values()[3..5], &[3, 4]) +//! ``` //! -//! assert_eq!(2, array.value(2), "Get the value with index 2"); +//! # Low-level API +//! +//! Internally, arrays consist of one or more shared memory regions backed by [`Buffer`], +//! the number and meaning of which depend on the array’s data type, as documented in +//! the [Arrow specification]. +//! +//! For example, the type [`Int16Array`] represents an array of 16-bit integers and consists of: +//! +//! * An optional [`NullBuffer`] identifying any null values +//! * A contiguous [`ScalarBuffer`] +//! +//! Similarly, the type [`StringArray`] represents an array of UTF-8 strings and consists of: +//! +//! * An optional [`NullBuffer`] identifying any null values +//! * An offsets [`ScalarBuffer`] identifying valid UTF-8 sequences within the values buffer +//! * A values [`Buffer`] of UTF-8 encoded string data +//! +//! Array constructors such as [`PrimitiveArray::try_new`] provide the ability to cheaply +//! construct an array from these parts, with functions such as [`PrimitiveArray::into_parts`] +//! 
providing the reverse operation. //! -//! assert_eq!( -//! &array.values()[3..5], -//! &[3, 4], -//! "Get slice of len 2 starting at idx 3" -//! ) //! ``` +//! # use arrow_array::{Array, Int32Array, StringArray}; +//! # use arrow_buffer::OffsetBuffer; +//! # +//! // Create an Int32Array from Vec without copying +//! let array = Int32Array::new(vec![1, 2, 3].into(), None); +//! assert_eq!(array.values(), &[1, 2, 3]); +//! assert_eq!(array.null_count(), 0); +//! +//! // Create a StringArray from parts +//! let offsets = OffsetBuffer::new(vec![0, 5, 10].into()); +//! let array = StringArray::new(offsets, b"helloworld".into(), None); +//! let values: Vec<_> = array.iter().map(|x| x.unwrap()).collect(); +//! assert_eq!(values, &["hello", "world"]); +//! ``` +//! +//! As [`Buffer`], and its derivatives, can be created from [`Vec`] without copying, this provides +//! an efficient way to not only interoperate with other Rust code, but also implement kernels +//! optimised for the arrow data layout - e.g. by handling buffers instead of values. //! //! # Zero-Copy Slicing //! @@ -122,32 +115,54 @@ //! data. Internally this just increments some ref-counts, and so is incredibly cheap //! //! ```rust -//! # use std::sync::Arc; -//! # use arrow_array::{ArrayRef, Int32Array}; -//! let array = Arc::new(Int32Array::from_iter([1, 2, 3])) as ArrayRef; +//! # use arrow_array::Int32Array; +//! let array = Int32Array::from_iter([1, 2, 3]); //! //! // Slice with offset 1 and length 2 //! let sliced = array.slice(1, 2); -//! let ints = sliced.as_any().downcast_ref::().unwrap(); -//! assert_eq!(ints.values(), &[2, 3]); +//! assert_eq!(sliced.values(), &[2, 3]); //! ``` //! -//! # Internal Representation +//! # Downcasting an Array //! -//! Internally, arrays are represented by one or several [`Buffer`], the number and meaning of -//! which depend on the array’s data type, as documented in the [Arrow specification]. +//! 
Arrays are often passed around as a dynamically typed [`&dyn Array`] or [`ArrayRef`]. +//! For example, [`RecordBatch`](`crate::RecordBatch`) stores columns as [`ArrayRef`]. //! -//! For example, the type [`Int16Array`] represents an array of 16-bit integers and consists of: +//! Whilst these arrays can be passed directly to the [`compute`], [`csv`], [`json`], etc... APIs, +//! it is often the case that you wish to interact with the concrete arrays directly. //! -//! * An optional [`NullBuffer`] identifying any null values -//! * A contiguous [`Buffer`] of 16-bit integers +//! This requires downcasting to the concrete type of the array: //! -//! Similarly, the type [`StringArray`] represents an array of UTF-8 strings and consists of: +//! ``` +//! # use arrow_array::{Array, Float32Array, Int32Array}; //! -//! * An optional [`NullBuffer`] identifying any null values -//! * An offsets [`Buffer`] of 32-bit integers identifying valid UTF-8 sequences within the values buffer -//! * A values [`Buffer`] of UTF-8 encoded string data +//! fn sum_int32(array: &dyn Array) -> i32 { +//! let integers: &Int32Array = array.as_any().downcast_ref().unwrap(); +//! integers.iter().map(|val| val.unwrap_or_default()).sum() +//! } +//! +//! // Note: the values for positions corresponding to nulls will be arbitrary +//! fn as_f32_slice(array: &dyn Array) -> &[f32] { +//! array.as_any().downcast_ref::().unwrap().values() +//! } +//! ``` +//! +//! The [`cast::AsArray`] extension trait can make this more ergonomic //! +//! ``` +//! # use arrow_array::Array; +//! # use arrow_array::cast::{AsArray, as_primitive_array}; +//! # use arrow_array::types::Float32Type; +//! +//! fn as_f32_slice(array: &dyn Array) -> &[f32] { +//! array.as_primitive::().values() +//! } +//! ``` +//! +//! [`ScalarBuffer`]: arrow_buffer::ScalarBuffer +//! [`ScalarBuffer`]: arrow_buffer::ScalarBuffer +//! [`ScalarBuffer`]: arrow_buffer::ScalarBuffer +//! [`NullBuffer`]: arrow_buffer::NullBuffer //! 
[Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html //! [`&dyn Array`]: Array //! [`NullBuffer`]: arrow_buffer::NullBuffer diff --git a/arrow-buffer/src/alloc/mod.rs b/arrow-buffer/src/alloc/mod.rs index d1236eeaa9a6..a3cb6253f324 100644 --- a/arrow-buffer/src/alloc/mod.rs +++ b/arrow-buffer/src/alloc/mod.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Defines memory-related functions, such as allocate/deallocate/reallocate memory -//! regions, cache and allocation alignments. +//! Defines the low-level [`Allocation`] API for shared memory regions use std::alloc::Layout; use std::fmt::{Debug, Formatter}; diff --git a/arrow-buffer/src/buffer/mod.rs b/arrow-buffer/src/buffer/mod.rs index ed53d3361daa..d33e68795e4e 100644 --- a/arrow-buffer/src/buffer/mod.rs +++ b/arrow-buffer/src/buffer/mod.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! This module contains two main structs: [Buffer] and [MutableBuffer]. A buffer represents -//! a contiguous memory region that can be shared via `offsets`. +//! Types of shared memory region mod offset; pub use offset::*; diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 9a905a3223b6..43c1cd004c92 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -36,7 +36,7 @@ use super::Buffer; /// Use [MutableBuffer::push] to insert an item, [MutableBuffer::extend_from_slice] /// to insert many items, and `into` to convert it to [`Buffer`]. 
/// -/// For a safe, strongly typed API consider using `Vec` +/// For a safe, strongly typed API consider using [`Vec`] and [`ScalarBuffer`](crate::ScalarBuffer) /// /// Note: this may be deprecated in a future release ([#1176](https://github.com/apache/arrow-rs/issues/1176)) /// diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs index cdb0c2aeb824..60987be6e415 100644 --- a/arrow-buffer/src/buffer/null.rs +++ b/arrow-buffer/src/buffer/null.rs @@ -19,6 +19,13 @@ use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator}; use crate::buffer::BooleanBuffer; use crate::{Buffer, MutableBuffer}; +/// A [`BooleanBuffer`] used to encode validity for arrow arrays +/// +/// As per the [Arrow specification], array validity is encoded in a packed bitmask with a +/// `true` value indicating the corresponding slot is not null, and `false` indicating +/// that it is null. +/// +/// [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps #[derive(Debug, Clone, Eq, PartialEq)] pub struct NullBuffer { buffer: BooleanBuffer, diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 1a4680111bd1..abde4bfd0d97 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -22,12 +22,24 @@ use std::fmt::Formatter; use std::marker::PhantomData; use std::ops::Deref; -/// Provides a safe API for interpreting a [`Buffer`] as a slice of [`ArrowNativeType`] +/// A strongly-typed [`Buffer`] supporting zero-copy cloning and slicing /// -/// # Safety +/// The easiest way to think about `ScalarBuffer` is being equivalent to an `Arc<Vec<T>>`, +/// with the following differences: /// -/// All [`ArrowNativeType`] are valid for all possible backing byte representations, and as -/// a result they are "trivially safely transmutable". +/// - slicing and cloning are O(1). 
+/// - it supports externally allocated memory +/// +/// ``` +/// # use arrow_buffer::ScalarBuffer; +/// // Zero-copy conversion from Vec +/// let buffer = ScalarBuffer::from(vec![1, 2, 3]); +/// assert_eq!(&buffer, &[1, 2, 3]); +/// +/// // Zero-copy slicing +/// let sliced = buffer.slice(1, 2); +/// assert_eq!(&sliced, &[2, 3]); +/// ``` #[derive(Clone)] pub struct ScalarBuffer { /// Underlying data buffer diff --git a/arrow-buffer/src/util/bit_iterator.rs b/arrow-buffer/src/util/bit_iterator.rs index 1a8dd9226318..4e24ccdabec0 100644 --- a/arrow-buffer/src/util/bit_iterator.rs +++ b/arrow-buffer/src/util/bit_iterator.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! Types for iterating over packed bitmasks + use crate::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator}; use crate::bit_util::{ceil, get_bit_raw}; diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 27c905ba0cd6..af5972acc97e 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -92,7 +92,7 @@ //! assert_eq!(sum(&TimestampNanosecondArray::from(vec![1, 2, 3])), 6); //! ``` //! -//! And the following is generic over all arrays with comparable values +//! And the following is generic over all arrays with comparable values: //! //! ```rust //! # use arrow::array::{ArrayAccessor, ArrayIter, Int32Array, StringArray}; @@ -109,7 +109,7 @@ //! assert_eq!(min(&StringArray::from(vec!["b", "a", "c"])), Some("a")); //! ``` //! -//! For more examples, consult the [arrow_array] docs. +//! For more examples and details, consult the [arrow_array] docs. //! //! # Type Erasure / Trait Objects //! @@ -317,19 +317,6 @@ //! assert_eq!(string.value(1), "foo"); //! ``` //! -//! # Memory and Buffers -//! -//! Advanced users may wish to interact with the underlying buffers of an [`Array`], for example, -//! for FFI or high-performance conversion from other formats. This interface is provided by -//! 
[`ArrayData`] which stores the [`Buffer`] comprising an [`Array`], and can be accessed -//! with [`Array::to_data`](array::Array::to_data) -//! -//! The APIs for constructing [`ArrayData`] come in safe, and unsafe variants, with the former -//! performing extensive, but potentially expensive validation to ensure the buffers are well-formed. -//! -//! An [`ArrayRef`] can be cheaply created from an [`ArrayData`] using [`make_array`], -//! or by using the appropriate [`From`] conversion on the concrete [`Array`] implementation. -//! //! # Safety and Security //! //! Like many crates, this crate makes use of unsafe where prudent. However, it endeavours to be