From 3f75972d841aded6bbfe6264c63f9c8e1667b115 Mon Sep 17 00:00:00 2001 From: Clement Rey Date: Thu, 13 Apr 2023 16:53:53 +0200 Subject: [PATCH 1/4] Taking care of List, LargeList, FixedSizeList and Map --- src/array/fixed_size_list/mod.rs | 4 +- src/array/fixed_size_list/mutable.rs | 2 +- src/array/list/mod.rs | 4 +- src/array/list/mutable.rs | 2 +- src/compute/cast/mod.rs | 2 +- src/datatypes/mod.rs | 69 +++++++++++++++++++---- src/ffi/schema.rs | 24 ++++---- src/io/avro/read/schema.rs | 2 +- src/io/ipc/read/schema.rs | 8 +-- src/io/json/read/deserialize.rs | 6 +- src/io/json/read/infer_schema.rs | 26 ++++----- src/io/json_integration/read/schema.rs | 10 ++-- src/io/parquet/read/schema/convert.rs | 46 +++++++-------- src/io/parquet/read/statistics/mod.rs | 6 +- src/io/parquet/write/mod.rs | 2 +- src/io/parquet/write/pages.rs | 4 +- tests/it/array/fixed_size_list/mod.rs | 21 +++++-- tests/it/array/fixed_size_list/mutable.rs | 4 +- tests/it/array/growable/map.rs | 6 +- tests/it/array/growable/union.rs | 2 +- tests/it/array/map/mod.rs | 2 +- tests/it/array/mod.rs | 10 ++-- tests/it/arrow.rs | 23 ++++++-- tests/it/compute/aggregate/memory.rs | 2 +- tests/it/compute/cast.rs | 10 ++-- tests/it/compute/filter.rs | 4 +- tests/it/ffi/data.rs | 6 +- tests/it/io/avro/read.rs | 6 +- tests/it/io/avro/write.rs | 8 +-- tests/it/io/json/read.rs | 12 ++-- tests/it/io/json/write.rs | 2 +- tests/it/io/ndjson/mod.rs | 8 +-- tests/it/io/ndjson/read.rs | 4 +- tests/it/io/parquet/mod.rs | 43 ++++++++++---- tests/it/scalar/fixed_size_list.rs | 4 +- tests/it/scalar/list.rs | 4 +- tests/it/scalar/map.rs | 4 +- 37 files changed, 249 insertions(+), 153 deletions(-) diff --git a/src/array/fixed_size_list/mod.rs b/src/array/fixed_size_list/mod.rs index 0d335167b20..9af967a06d3 100644 --- a/src/array/fixed_size_list/mod.rs +++ b/src/array/fixed_size_list/mod.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use crate::{ bitmap::Bitmap, datatypes::{DataType, Field}, @@ -204,7 +206,7 @@ impl 
FixedSizeListArray { /// Returns a [`DataType`] consistent with [`FixedSizeListArray`]. pub fn default_datatype(data_type: DataType, size: usize) -> DataType { - let field = Box::new(Field::new("item", data_type, true)); + let field = Arc::new(Field::new("item", data_type, true)); DataType::FixedSizeList(field, size) } } diff --git a/src/array/fixed_size_list/mutable.rs b/src/array/fixed_size_list/mutable.rs index d929f75e6e8..f12c4d13e80 100644 --- a/src/array/fixed_size_list/mutable.rs +++ b/src/array/fixed_size_list/mutable.rs @@ -41,7 +41,7 @@ impl MutableFixedSizeListArray { /// Creates a new [`MutableFixedSizeListArray`] from a [`MutableArray`] and size. pub fn new_with_field(values: M, name: &str, nullable: bool, size: usize) -> Self { let data_type = DataType::FixedSizeList( - Box::new(Field::new(name, values.data_type().clone(), nullable)), + Arc::new(Field::new(name, values.data_type().clone(), nullable)), size, ); Self::new_from(values, data_type, size) diff --git a/src/array/list/mod.rs b/src/array/list/mod.rs index b7eda9b4d5c..55b4875cf75 100644 --- a/src/array/list/mod.rs +++ b/src/array/list/mod.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use crate::{ bitmap::Bitmap, datatypes::{DataType, Field}, @@ -188,7 +190,7 @@ impl ListArray { impl ListArray { /// Returns a default [`DataType`]: inner field is named "item" and is nullable pub fn default_datatype(data_type: DataType) -> DataType { - let field = Box::new(Field::new("item", data_type, true)); + let field = Arc::new(Field::new("item", data_type, true)); if O::IS_LARGE { DataType::LargeList(field) } else { diff --git a/src/array/list/mutable.rs b/src/array/list/mutable.rs index 881cb620a03..a8f74c388a0 100644 --- a/src/array/list/mutable.rs +++ b/src/array/list/mutable.rs @@ -127,7 +127,7 @@ impl MutableListArray { /// Creates a new [`MutableListArray`] from a [`MutableArray`]. 
pub fn new_with_field(values: M, name: &str, nullable: bool) -> Self { - let field = Box::new(Field::new(name, values.data_type().clone(), nullable)); + let field = Arc::new(Field::new(name, values.data_type().clone(), nullable)); let data_type = if O::IS_LARGE { DataType::LargeList(field) } else { diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index e42f769e7e5..9541a1d0e79 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -389,7 +389,7 @@ fn cast_list_to_fixed_size_list( None => { let new_values = cast(list.values().as_ref(), inner.data_type(), options)?; Ok(FixedSizeListArray::new( - DataType::FixedSizeList(Box::new(inner.clone()), size), + DataType::FixedSizeList(std::sync::Arc::new(inner.clone()), size), new_values, list.validity().cloned(), )) diff --git a/src/datatypes/mod.rs b/src/datatypes/mod.rs index 2582bb7a6cd..8866446b798 100644 --- a/src/datatypes/mod.rs +++ b/src/datatypes/mod.rs @@ -20,6 +20,47 @@ pub type Metadata = BTreeMap; /// typedef fpr [Option<(String, Option)>] descr pub(crate) type Extension = Option<(String, Option)>; +/// An extension trait to polyfill [`Arc::unwrap_or_clone`] from the nightly stdlib. +pub trait ArcExt { + /// If we have the only reference to `T` then unwrap it. Otherwise, clone `T` and return the + /// clone. + /// + /// Assuming `arc_t` is of type `Arc`, this function is functionally equivalent to + /// `(*arc_t).clone()`, but will avoid cloning the inner value where possible. 
+ /// + /// # Examples + /// + /// ``` + /// # use std::{ptr, sync::Arc}; + /// # use arrow2::datatypes::ArcExt; + /// let inner = String::from("test"); + /// let ptr = inner.as_ptr(); + /// + /// let arc = Arc::new(inner); + /// let inner = Arc::unwrap_or_clone_polyfill(arc); + /// // The inner value was not cloned + /// assert!(ptr::eq(ptr, inner.as_ptr())); + /// + /// let arc = Arc::new(inner); + /// let arc2 = arc.clone(); + /// let inner = Arc::unwrap_or_clone_polyfill(arc); + /// // Because there were 2 references, we had to clone the inner value. + /// assert!(!ptr::eq(ptr, inner.as_ptr())); + /// // `arc2` is the last reference, so when we unwrap it we get back + /// // the original `String`. + /// let inner = Arc::unwrap_or_clone_polyfill(arc2); + /// assert!(ptr::eq(ptr, inner.as_ptr())); + /// ``` + fn unwrap_or_clone_polyfill(this: Self) -> T; +} + +impl ArcExt for Arc { + #[inline] + fn unwrap_or_clone_polyfill(this: Self) -> T { + Arc::try_unwrap(this).unwrap_or_else(|arc| (*arc).clone()) + } +} + /// The set of supported logical types in this crate. /// /// Each variant uniquely identifies a logical type, which define specific semantics to the data @@ -100,11 +141,11 @@ pub enum DataType { /// A variable-length UTF-8 encoded string whose offsets are represented as [`i64`]. LargeUtf8, /// A list of some logical data type whose offsets are represented as [`i32`]. - List(Box), + List(Arc), /// A list of some logical data type with a fixed number of elements. - FixedSizeList(Box, usize), + FixedSizeList(Arc, usize), /// A list of some logical data type whose offsets are represented as [`i64`]. - LargeList(Box), + LargeList(Arc), /// A nested [`DataType`] with a given number of [`Field`]s. Struct(Vec), /// A nested datatype that can represent slots of differing types. @@ -135,7 +176,7 @@ pub enum DataType { /// The metadata is structured so that Arrow systems without special handling /// for Map can make Map an alias for List. 
The "layout" attribute for the Map /// field must have the same contents as a List. - Map(Box, bool), + Map(Arc, bool), /// A dictionary encoded array (`key_type`, `value_type`), where /// each array element is an index of `key_type` into an /// associated dictionary of `value_type`. @@ -189,11 +230,13 @@ impl From for arrow_schema::DataType { DataType::LargeBinary => Self::LargeBinary, DataType::Utf8 => Self::Utf8, DataType::LargeUtf8 => Self::LargeUtf8, - DataType::List(f) => Self::List(Box::new((*f).into())), + DataType::List(f) => Self::List(Box::new(Arc::unwrap_or_clone_polyfill(f).into())), DataType::FixedSizeList(f, size) => { - Self::FixedSizeList(Box::new((*f).into()), size as _) + Self::FixedSizeList(Box::new(Arc::unwrap_or_clone_polyfill(f).into()), size as _) + } + DataType::LargeList(f) => { + Self::LargeList(Box::new(Arc::unwrap_or_clone_polyfill(f).into())) } - DataType::LargeList(f) => Self::LargeList(Box::new((*f).into())), DataType::Struct(f) => Self::Struct(f.into_iter().map(Into::into).collect()), DataType::Union(fields, Some(ids), mode) => { let ids = ids.into_iter().map(|x| x as _).collect(); @@ -205,7 +248,9 @@ impl From for arrow_schema::DataType { let fields = fields.into_iter().map(Into::into).collect(); Self::Union(fields, ids, mode.into()) } - DataType::Map(f, ordered) => Self::Map(Box::new((*f).into()), ordered), + DataType::Map(f, ordered) => { + Self::Map(Box::new(Arc::unwrap_or_clone_polyfill(f).into()), ordered) + } DataType::Dictionary(key, value, _) => Self::Dictionary( Box::new(DataType::from(key).into()), Box::new((*value).into()), @@ -247,18 +292,18 @@ impl From for DataType { DataType::LargeBinary => Self::LargeBinary, DataType::Utf8 => Self::Utf8, DataType::LargeUtf8 => Self::LargeUtf8, - DataType::List(f) => Self::List(Box::new((*f).into())), + DataType::List(f) => Self::List(Arc::new((*f).into())), DataType::FixedSizeList(f, size) => { - Self::FixedSizeList(Box::new((*f).into()), size as _) + 
Self::FixedSizeList(Arc::new((*f).into()), size as _) } - DataType::LargeList(f) => Self::LargeList(Box::new((*f).into())), + DataType::LargeList(f) => Self::LargeList(Arc::new((*f).into())), DataType::Struct(f) => Self::Struct(f.into_iter().map(Into::into).collect()), DataType::Union(fields, ids, mode) => { let ids = ids.into_iter().map(|x| x as _).collect(); let fields = fields.into_iter().map(Into::into).collect(); Self::Union(fields, Some(ids), mode.into()) } - DataType::Map(f, ordered) => Self::Map(Box::new((*f).into()), ordered), + DataType::Map(f, ordered) => Self::Map(std::sync::Arc::new((*f).into()), ordered), DataType::Dictionary(key, value) => { let key = match *key { DataType::Int8 => IntegerType::Int8, diff --git a/src/ffi/schema.rs b/src/ffi/schema.rs index 2751583ef1b..2e86dea49fe 100644 --- a/src/ffi/schema.rs +++ b/src/ffi/schema.rs @@ -1,4 +1,4 @@ -use std::{collections::BTreeMap, convert::TryInto, ffi::CStr, ffi::CString, ptr}; +use std::{collections::BTreeMap, convert::TryInto, ffi::CStr, ffi::CString, ptr, sync::Arc}; use crate::{ datatypes::{ @@ -260,17 +260,17 @@ unsafe fn to_data_type(schema: &ArrowSchema) -> Result { "tiD" => DataType::Interval(IntervalUnit::DayTime), "+l" => { let child = schema.child(0); - DataType::List(Box::new(to_field(child)?)) + DataType::List(Arc::new(to_field(child)?)) } "+L" => { let child = schema.child(0); - DataType::LargeList(Box::new(to_field(child)?)) + DataType::LargeList(Arc::new(to_field(child)?)) } "+m" => { let child = schema.child(0); let is_sorted = (schema.flags & 4) != 0; - DataType::Map(Box::new(to_field(child)?), is_sorted) + DataType::Map(std::sync::Arc::new(to_field(child)?), is_sorted) } "+s" => { let children = (0..schema.n_children as usize) @@ -305,7 +305,7 @@ unsafe fn to_data_type(schema: &ArrowSchema) -> Result { .parse::() .map_err(|_| Error::OutOfSpec("size is not a valid integer".to_string()))?; let child = to_field(schema.child(0))?; - DataType::FixedSizeList(Box::new(child), size) + 
DataType::FixedSizeList(Arc::new(child), size) } ["d", raw] => { // Decimal @@ -565,24 +565,24 @@ mod tests { DataType::Binary, DataType::LargeBinary, DataType::FixedSizeBinary(2), - DataType::List(Box::new(Field::new("example", DataType::Boolean, false))), - DataType::FixedSizeList(Box::new(Field::new("example", DataType::Boolean, false)), 2), - DataType::LargeList(Box::new(Field::new("example", DataType::Boolean, false))), + DataType::List(Arc::new(Field::new("example", DataType::Boolean, false))), + DataType::FixedSizeList(Arc::new(Field::new("example", DataType::Boolean, false)), 2), + DataType::LargeList(Arc::new(Field::new("example", DataType::Boolean, false))), DataType::Struct(vec![ Field::new("a", DataType::Int64, true), Field::new( "b", - DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), true, ), ]), - DataType::Map(Box::new(Field::new("a", DataType::Int64, true)), true), + DataType::Map(std::sync::Arc::new(Field::new("a", DataType::Int64, true)), true), DataType::Union( vec![ Field::new("a", DataType::Int64, true), Field::new( "b", - DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), true, ), ], @@ -594,7 +594,7 @@ mod tests { Field::new("a", DataType::Int64, true), Field::new( "b", - DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), true, ), ], diff --git a/src/io/avro/read/schema.rs b/src/io/avro/read/schema.rs index 3a920286c03..5999503941f 100644 --- a/src/io/avro/read/schema.rs +++ b/src/io/avro/read/schema.rs @@ -77,7 +77,7 @@ fn schema_to_field(schema: &AvroSchema, name: Option<&str>, props: Metadata) -> None => DataType::Binary, }, AvroSchema::String(_) => DataType::Utf8, - AvroSchema::Array(item_schema) => DataType::List(Box::new(schema_to_field( + AvroSchema::Array(item_schema) => 
DataType::List(std::sync::Arc::new(schema_to_field( item_schema, Some("item"), // default name for list items Metadata::default(), diff --git a/src/io/ipc/read/schema.rs b/src/io/ipc/read/schema.rs index 7ec87eaa334..eb71cee1ea5 100644 --- a/src/io/ipc/read/schema.rs +++ b/src/io/ipc/read/schema.rs @@ -145,7 +145,7 @@ fn deserialize_map(map: MapRef, field: FieldRef) -> Result<(DataType, IpcField)> .ok_or_else(|| Error::oos("IPC: Map must contain one child"))??; let (field, ipc_field) = deserialize_field(inner)?; - let data_type = DataType::Map(Box::new(field), is_sorted); + let data_type = DataType::Map(std::sync::Arc::new(field), is_sorted); Ok(( data_type, IpcField { @@ -183,7 +183,7 @@ fn deserialize_list(field: FieldRef) -> Result<(DataType, IpcField)> { let (field, ipc_field) = deserialize_field(inner)?; Ok(( - DataType::List(Box::new(field)), + DataType::List(std::sync::Arc::new(field)), IpcField { fields: vec![ipc_field], dictionary_id: None, @@ -201,7 +201,7 @@ fn deserialize_large_list(field: FieldRef) -> Result<(DataType, IpcField)> { let (field, ipc_field) = deserialize_field(inner)?; Ok(( - DataType::LargeList(Box::new(field)), + DataType::LargeList(std::sync::Arc::new(field)), IpcField { fields: vec![ipc_field], dictionary_id: None, @@ -227,7 +227,7 @@ fn deserialize_fixed_size_list( .map_err(|_| Error::from(OutOfSpecKind::NegativeFooterLength))?; Ok(( - DataType::FixedSizeList(Box::new(field), size), + DataType::FixedSizeList(std::sync::Arc::new(field), size), IpcField { fields: vec![ipc_field], dictionary_id: None, diff --git a/src/io/json/read/deserialize.rs b/src/io/json/read/deserialize.rs index 5768eaa74eb..bfa250ab313 100644 --- a/src/io/json/read/deserialize.rs +++ b/src/io/json/read/deserialize.rs @@ -535,10 +535,10 @@ pub(crate) fn _deserialize<'a, A: Borrow>>( let iter = rows.iter().map(|row| match row.borrow() { Value::Number(v) => Some(deserialize_int_single(*v)), Value::String(v) => match (tu, tz) { - (_, None) => 
temporal_conversions::utf8_to_naive_timestamp_scalar(v, "%+", &tu), + (_, None) => temporal_conversions::utf8_to_naive_timestamp_scalar(v, "%+", tu), (_, Some(ref tz)) => { let tz = temporal_conversions::parse_offset(tz).unwrap(); - temporal_conversions::utf8_to_timestamp_scalar(v, "%+", &tz, &tu) + temporal_conversions::utf8_to_timestamp_scalar(v, "%+", &tz, tu) } }, _ => None, @@ -599,7 +599,7 @@ pub fn deserialize(json: &Value, data_type: DataType) -> Result, match json { Value::Array(rows) => match data_type { DataType::List(inner) | DataType::LargeList(inner) => { - Ok(_deserialize(rows, inner.data_type)) + Ok(_deserialize(rows, inner.data_type.clone())) } _ => Err(Error::nyi("read an Array from a non-Array data type")), }, diff --git a/src/io/json/read/infer_schema.rs b/src/io/json/read/infer_schema.rs index f098bf80d3b..13f0c50360f 100644 --- a/src/io/json/read/infer_schema.rs +++ b/src/io/json/read/infer_schema.rs @@ -38,7 +38,7 @@ pub fn infer_records_schema(json: &Value) -> Result { Ok(Field { name: name.clone(), - data_type: DataType::List(Box::new(Field { + data_type: DataType::List(std::sync::Arc::new(Field { name: format!("{name}-records"), data_type, is_nullable: true, @@ -105,7 +105,7 @@ fn infer_array(values: &[Value]) -> Result { Ok(if dt == DataType::Null { dt } else { - DataType::List(Box::new(Field::new(ITEM_NAME, dt, true))) + DataType::List(std::sync::Arc::new(Field::new(ITEM_NAME, dt, true))) }) } @@ -180,15 +180,15 @@ pub(crate) fn coerce_data_type>(datatypes: &[A]) -> DataType (lhs, rhs) if lhs == rhs => lhs.clone(), (List(lhs), List(rhs)) => { let inner = coerce_data_type(&[lhs.data_type(), rhs.data_type()]); - List(Box::new(Field::new(ITEM_NAME, inner, true))) + List(std::sync::Arc::new(Field::new(ITEM_NAME, inner, true))) } (scalar, List(list)) => { let inner = coerce_data_type(&[scalar, list.data_type()]); - List(Box::new(Field::new(ITEM_NAME, inner, true))) + List(std::sync::Arc::new(Field::new(ITEM_NAME, inner, true))) } (List(list), 
scalar) => { let inner = coerce_data_type(&[scalar, list.data_type()]); - List(Box::new(Field::new(ITEM_NAME, inner, true))) + List(std::sync::Arc::new(Field::new(ITEM_NAME, inner, true))) } (Float64, Int64) => Float64, (Int64, Float64) => Float64, @@ -209,25 +209,25 @@ mod test { assert_eq!( coerce_data_type(&[ Float64, - List(Box::new(Field::new(ITEM_NAME, Float64, true))) + List(std::sync::Arc::new(Field::new(ITEM_NAME, Float64, true))) ]), - List(Box::new(Field::new(ITEM_NAME, Float64, true))), + List(std::sync::Arc::new(Field::new(ITEM_NAME, Float64, true))), ); assert_eq!( - coerce_data_type(&[Float64, List(Box::new(Field::new(ITEM_NAME, Int64, true)))]), - List(Box::new(Field::new(ITEM_NAME, Float64, true))), + coerce_data_type(&[Float64, List(std::sync::Arc::new(Field::new(ITEM_NAME, Int64, true)))]), + List(std::sync::Arc::new(Field::new(ITEM_NAME, Float64, true))), ); assert_eq!( - coerce_data_type(&[Int64, List(Box::new(Field::new(ITEM_NAME, Int64, true)))]), - List(Box::new(Field::new(ITEM_NAME, Int64, true))), + coerce_data_type(&[Int64, List(std::sync::Arc::new(Field::new(ITEM_NAME, Int64, true)))]), + List(std::sync::Arc::new(Field::new(ITEM_NAME, Int64, true))), ); // boolean and number are incompatible, return utf8 assert_eq!( coerce_data_type(&[ Boolean, - List(Box::new(Field::new(ITEM_NAME, Float64, true))) + List(std::sync::Arc::new(Field::new(ITEM_NAME, Float64, true))) ]), - List(Box::new(Field::new(ITEM_NAME, Utf8, true))), + List(std::sync::Arc::new(Field::new(ITEM_NAME, Utf8, true))), ); } diff --git a/src/io/json_integration/read/schema.rs b/src/io/json_integration/read/schema.rs index 6a54dafc2b1..e127b4852d2 100644 --- a/src/io/json_integration/read/schema.rs +++ b/src/io/json_integration/read/schema.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use serde_derive::Deserialize; use serde_json::Value; @@ -243,12 +245,12 @@ fn to_data_type(item: &Value, mut children: Vec) -> Result { } }, "int" => to_int(item).map(|x| x.into())?, - "list" => 
DataType::List(Box::new(children.pop().unwrap())), - "largelist" => DataType::LargeList(Box::new(children.pop().unwrap())), + "list" => DataType::List(std::sync::Arc::new(children.pop().unwrap())), + "largelist" => DataType::LargeList(std::sync::Arc::new(children.pop().unwrap())), "fixedsizelist" => { if let Some(Value::Number(size)) = item.get("listSize") { DataType::FixedSizeList( - Box::new(children.pop().unwrap()), + Arc::new(children.pop().unwrap()), size.as_i64().unwrap() as usize, ) } else { @@ -277,7 +279,7 @@ fn to_data_type(item: &Value, mut children: Vec) -> Result { } else { return Err(Error::OutOfSpec("sorted keys not defined".to_string())); }; - DataType::Map(Box::new(children.pop().unwrap()), sorted_keys) + DataType::Map(std::sync::Arc::new(children.pop().unwrap()), sorted_keys) } other => { return Err(Error::NotYetImplemented(format!( diff --git a/src/io/parquet/read/schema/convert.rs b/src/io/parquet/read/schema/convert.rs index 821d5107649..c795f724232 100644 --- a/src/io/parquet/read/schema/convert.rs +++ b/src/io/parquet/read/schema/convert.rs @@ -199,7 +199,7 @@ fn to_primitive_type(primitive_type: &PrimitiveType) -> DataType { let base_type = to_primitive_type_inner(primitive_type); if primitive_type.field_info.repetition == Repetition::Repeated { - DataType::List(Box::new(Field::new( + DataType::List(std::sync::Arc::new(Field::new( &primitive_type.field_info.name, base_type, is_nullable(&primitive_type.field_info), @@ -242,7 +242,7 @@ fn to_struct(fields: &[ParquetType]) -> Option { /// Returns [`None`] if all its fields are empty fn to_map(fields: &[ParquetType]) -> Option { let inner = to_field(&fields[0])?; - Some(DataType::Map(Box::new(inner), false)) + Some(DataType::Map(std::sync::Arc::new(inner), false)) } /// Entry point for converting parquet group type. 
@@ -257,7 +257,7 @@ fn to_group_type( ) -> Option { debug_assert!(!fields.is_empty()); if field_info.repetition == Repetition::Repeated { - Some(DataType::List(Box::new(Field::new( + Some(DataType::List(std::sync::Arc::new(Field::new( &field_info.name, to_struct(fields)?, is_nullable(field_info), @@ -330,7 +330,7 @@ fn to_list(fields: &[ParquetType], parent_name: &str) -> Option { ), }; - Some(DataType::List(Box::new(Field::new( + Some(DataType::List(std::sync::Arc::new(Field::new( list_item_name, item_type, item_is_optional, @@ -521,7 +521,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", DataType::Utf8, true))), + DataType::List(std::sync::Arc::new(Field::new("element", DataType::Utf8, true))), false, )); } @@ -535,7 +535,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", DataType::Utf8, false))), + DataType::List(std::sync::Arc::new(Field::new("element", DataType::Utf8, false))), true, )); } @@ -554,10 +554,10 @@ mod tests { // } { let arrow_inner_list = - DataType::List(Box::new(Field::new("element", DataType::Int32, false))); + DataType::List(std::sync::Arc::new(Field::new("element", DataType::Int32, false))); arrow_fields.push(Field::new( "array_of_arrays", - DataType::List(Box::new(Field::new("element", arrow_inner_list, false))), + DataType::List(std::sync::Arc::new(Field::new("element", arrow_inner_list, false))), true, )); } @@ -571,7 +571,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", DataType::Utf8, true))), + DataType::List(std::sync::Arc::new(Field::new("element", DataType::Utf8, true))), true, )); } @@ -583,7 +583,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", DataType::Int32, true))), + DataType::List(std::sync::Arc::new(Field::new("element", DataType::Int32, true))), true, )); } @@ -602,7 +602,7 @@ mod tests { ]); 
arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("element", arrow_struct, true))), + DataType::List(std::sync::Arc::new(Field::new("element", arrow_struct, true))), true, )); } @@ -618,7 +618,7 @@ mod tests { let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("array", arrow_struct, true))), + DataType::List(std::sync::Arc::new(Field::new("array", arrow_struct, true))), true, )); } @@ -634,7 +634,7 @@ mod tests { let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); arrow_fields.push(Field::new( "my_list", - DataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, true))), + DataType::List(std::sync::Arc::new(Field::new("my_list_tuple", arrow_struct, true))), true, )); } @@ -644,7 +644,7 @@ mod tests { { arrow_fields.push(Field::new( "name", - DataType::List(Box::new(Field::new("name", DataType::Int32, true))), + DataType::List(std::sync::Arc::new(Field::new("name", DataType::Int32, true))), true, )); } @@ -689,7 +689,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list1", - DataType::List(Box::new(Field::new("element", DataType::Utf8, true))), + DataType::List(std::sync::Arc::new(Field::new("element", DataType::Utf8, true))), false, )); } @@ -703,7 +703,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list2", - DataType::List(Box::new(Field::new("element", DataType::Utf8, false))), + DataType::List(std::sync::Arc::new(Field::new("element", DataType::Utf8, false))), true, )); } @@ -717,7 +717,7 @@ mod tests { { arrow_fields.push(Field::new( "my_list3", - DataType::List(Box::new(Field::new("element", DataType::Utf8, false))), + DataType::List(std::sync::Arc::new(Field::new("element", DataType::Utf8, false))), false, )); } @@ -769,7 +769,7 @@ mod tests { let inner_group_list = Field::new( "innerGroup", - DataType::List(Box::new(Field::new( + 
DataType::List(std::sync::Arc::new(Field::new( "innerGroup", DataType::Struct(vec![Field::new("leaf3", DataType::Int32, true)]), true, @@ -779,7 +779,7 @@ mod tests { let outer_group_list = Field::new( "outerGroup", - DataType::List(Box::new(Field::new( + DataType::List(std::sync::Arc::new(Field::new( "outerGroup", DataType::Struct(vec![ Field::new("leaf2", DataType::Int32, true), @@ -848,7 +848,7 @@ mod tests { Field::new("string", DataType::Utf8, true), Field::new( "bools", - DataType::List(Box::new(Field::new("bools", DataType::Boolean, true))), + DataType::List(std::sync::Arc::new(Field::new("bools", DataType::Boolean, true))), true, ), Field::new("date", DataType::Date32, true), @@ -930,12 +930,12 @@ mod tests { Field::new("string", DataType::Utf8, true), Field::new( "bools", - DataType::List(Box::new(Field::new("element", DataType::Boolean, true))), + DataType::List(std::sync::Arc::new(Field::new("element", DataType::Boolean, true))), true, ), Field::new( "bools_non_null", - DataType::List(Box::new(Field::new("element", DataType::Boolean, false))), + DataType::List(std::sync::Arc::new(Field::new("element", DataType::Boolean, false))), false, ), Field::new("date", DataType::Date32, true), @@ -958,7 +958,7 @@ mod tests { Field::new("uint32", DataType::UInt32, false), Field::new( "int32", - DataType::List(Box::new(Field::new("element", DataType::Int32, true))), + DataType::List(std::sync::Arc::new(Field::new("element", DataType::Int32, true))), false, ), ]), diff --git a/src/io/parquet/read/statistics/mod.rs b/src/io/parquet/read/statistics/mod.rs index f3c1ed9e8de..b2f1766c015 100644 --- a/src/io/parquet/read/statistics/mod.rs +++ b/src/io/parquet/read/statistics/mod.rs @@ -212,17 +212,17 @@ fn create_dt(data_type: &DataType) -> DataType { ) } else if let DataType::Map(f, ordered) = data_type.to_logical_type() { DataType::Map( - Box::new(Field::new(&f.name, create_dt(&f.data_type), f.is_nullable)), + Arc::new(Field::new(&f.name, create_dt(&f.data_type), 
f.is_nullable)), *ordered, ) } else if let DataType::List(f) = data_type.to_logical_type() { - DataType::List(Box::new(Field::new( + DataType::List(std::sync::Arc::new(Field::new( &f.name, create_dt(&f.data_type), f.is_nullable, ))) } else if let DataType::LargeList(f) = data_type.to_logical_type() { - DataType::LargeList(Box::new(Field::new( + DataType::LargeList(std::sync::Arc::new(Field::new( &f.name, create_dt(&f.data_type), f.is_nullable, diff --git a/src/io/parquet/write/mod.rs b/src/io/parquet/write/mod.rs index a0040a9a0d7..4b86d707f68 100644 --- a/src/io/parquet/write/mod.rs +++ b/src/io/parquet/write/mod.rs @@ -741,7 +741,7 @@ fn transverse_recursive T + Clone>( /// /// let dt = DataType::Struct(vec![ /// Field::new("a", DataType::Int64, true), -/// Field::new("b", DataType::List(Box::new(Field::new("item", DataType::Int32, true))), true), +/// Field::new("b", DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, true))), true), /// ]); /// /// let encodings = transverse(&dt, |dt| Encoding::Plain); diff --git a/src/io/parquet/write/pages.rs b/src/io/parquet/write/pages.rs index 10aea638a22..eae4b70250b 100644 --- a/src/io/parquet/write/pages.rs +++ b/src/io/parquet/write/pages.rs @@ -453,7 +453,7 @@ mod tests { ); let array = ListArray::new( - DataType::List(Box::new(Field::new("l", array.data_type().clone(), true))), + DataType::List(std::sync::Arc::new(Field::new("l", array.data_type().clone(), true))), vec![0i32, 2, 4].try_into().unwrap(), Box::new(array), None, @@ -545,7 +545,7 @@ mod tests { Field::new("v", DataType::Int32, false), ]); let kv_field = Field::new("kv", kv_type.clone(), false); - let map_type = DataType::Map(Box::new(kv_field), false); + let map_type = DataType::Map(std::sync::Arc::new(kv_field), false); let key_array = Utf8Array::::from_slice(["k1", "k2", "k3", "k4", "k5", "k6"]).boxed(); let val_array = Int32Array::from_slice([42, 28, 19, 31, 21, 17]).boxed(); diff --git a/tests/it/array/fixed_size_list/mod.rs 
b/tests/it/array/fixed_size_list/mod.rs index c2a4b11e62c..f538c65dd68 100644 --- a/tests/it/array/fixed_size_list/mod.rs +++ b/tests/it/array/fixed_size_list/mod.rs @@ -1,5 +1,7 @@ mod mutable; +use std::sync::Arc; + use arrow2::{ array::*, bitmap::Bitmap, @@ -11,7 +13,7 @@ fn data() -> FixedSizeListArray { FixedSizeListArray::try_new( DataType::FixedSizeList( - Box::new(Field::new("a", values.data_type().clone(), true)), + Arc::new(Field::new("a", values.data_type().clone(), true)), 2, ), values.boxed(), @@ -53,7 +55,7 @@ fn debug() { #[test] fn empty() { let array = FixedSizeListArray::new_empty(DataType::FixedSizeList( - Box::new(Field::new("a", DataType::Int32, true)), + Arc::new(Field::new("a", DataType::Int32, true)), 2, )); assert_eq!(array.values().len(), 0); @@ -63,7 +65,10 @@ fn empty() { #[test] fn null() { let array = FixedSizeListArray::new_null( - DataType::FixedSizeList(Box::new(Field::new("a", DataType::Int32, true)), 2), + DataType::FixedSizeList( + std::sync::Arc::new(Field::new("a", DataType::Int32, true)), + 2, + ), 2, ); assert_eq!(array.values().len(), 4); @@ -74,7 +79,10 @@ fn null() { fn wrong_size() { let values = Int32Array::from_slice([10, 20, 0]); assert!(FixedSizeListArray::try_new( - DataType::FixedSizeList(Box::new(Field::new("a", DataType::Int32, true)), 2), + DataType::FixedSizeList( + std::sync::Arc::new(Field::new("a", DataType::Int32, true)), + 2 + ), values.boxed(), None ) @@ -85,7 +93,10 @@ fn wrong_size() { fn wrong_len() { let values = Int32Array::from_slice([10, 20, 0]); assert!(FixedSizeListArray::try_new( - DataType::FixedSizeList(Box::new(Field::new("a", DataType::Int32, true)), 2), + DataType::FixedSizeList( + std::sync::Arc::new(Field::new("a", DataType::Int32, true)), + 2 + ), values.boxed(), Some([true, false, false].into()), // it should be 2 ) diff --git a/tests/it/array/fixed_size_list/mutable.rs b/tests/it/array/fixed_size_list/mutable.rs index f7a8784dfce..a267352eb70 100644 --- 
a/tests/it/array/fixed_size_list/mutable.rs +++ b/tests/it/array/fixed_size_list/mutable.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::array::*; use arrow2::datatypes::{DataType, Field}; @@ -46,7 +48,7 @@ fn new_with_field() { assert_eq!( list.data_type(), &DataType::FixedSizeList( - Box::new(Field::new("custom_items", DataType::Int32, false)), + Arc::new(Field::new("custom_items", DataType::Int32, false)), 3 ) ); diff --git a/tests/it/array/growable/map.rs b/tests/it/array/growable/map.rs index de9069f68bc..e98b98903b3 100644 --- a/tests/it/array/growable/map.rs +++ b/tests/it/array/growable/map.rs @@ -38,7 +38,7 @@ fn basic() { let kv_array = StructArray::new(fields.clone(), values, None).boxed(); let kv_field = Field::new("kv", fields, false); - let data_type = DataType::Map(Box::new(kv_field), false); + let data_type = DataType::Map(std::sync::Arc::new(kv_field), false); let offsets = OffsetsBuffer::try_from(vec![0, 1, 2, 4, 6]).unwrap(); let array = MapArray::new(data_type.clone(), offsets, kv_array.clone(), None); @@ -62,7 +62,7 @@ fn offset() { let kv_array = StructArray::new(fields.clone(), values, None).boxed(); let kv_field = Field::new("kv", fields, false); - let data_type = DataType::Map(Box::new(kv_field), false); + let data_type = DataType::Map(std::sync::Arc::new(kv_field), false); let offsets = OffsetsBuffer::try_from(vec![0, 1, 2, 4, 6]).unwrap(); let array = MapArray::new(data_type.clone(), offsets, kv_array.clone(), None).sliced(1, 3); @@ -86,7 +86,7 @@ fn nulls() { let kv_array = StructArray::new(fields.clone(), values, None).boxed(); let kv_field = Field::new("kv", fields, false); - let data_type = DataType::Map(Box::new(kv_field), false); + let data_type = DataType::Map(std::sync::Arc::new(kv_field), false); let offsets = OffsetsBuffer::try_from(vec![0, 1, 2, 4, 6]).unwrap(); let array = MapArray::new( diff --git a/tests/it/array/growable/union.rs b/tests/it/array/growable/union.rs index 21cbbebabf4..520a64092e4 100644 --- 
a/tests/it/array/growable/union.rs +++ b/tests/it/array/growable/union.rs @@ -75,7 +75,7 @@ fn dense() -> Result<()> { #[test] fn complex_dense() -> Result<()> { let fixed_size_type = - DataType::FixedSizeList(Box::new(Field::new("i", DataType::UInt16, true)), 3); + DataType::FixedSizeList(std::sync::Arc::new(Field::new("i", DataType::UInt16, true)), 3); let fields = vec![ Field::new("a", DataType::Int32, true), diff --git a/tests/it/array/map/mod.rs b/tests/it/array/map/mod.rs index 285bc8b39b3..1d3ab488554 100644 --- a/tests/it/array/map/mod.rs +++ b/tests/it/array/map/mod.rs @@ -9,7 +9,7 @@ fn basics() { Field::new("a", DataType::Utf8, true), Field::new("b", DataType::Utf8, true), ]); - let data_type = DataType::Map(Box::new(Field::new("a", dt.clone(), true)), false); + let data_type = DataType::Map(std::sync::Arc::new(Field::new("a", dt.clone(), true)), false); let field = StructArray::new( dt.clone(), diff --git a/tests/it/array/mod.rs b/tests/it/array/mod.rs index 85318ba628a..628daa47451 100644 --- a/tests/it/array/mod.rs +++ b/tests/it/array/mod.rs @@ -24,7 +24,7 @@ fn nulls() { DataType::Float64, DataType::Utf8, DataType::Binary, - DataType::List(Box::new(Field::new("a", DataType::Binary, true))), + DataType::List(std::sync::Arc::new(Field::new("a", DataType::Binary, true))), ]; let a = datatypes .into_iter() @@ -57,8 +57,8 @@ fn empty() { DataType::Float64, DataType::Utf8, DataType::Binary, - DataType::List(Box::new(Field::new("a", DataType::Binary, true))), - DataType::List(Box::new(Field::new( + DataType::List(std::sync::Arc::new(Field::new("a", DataType::Binary, true))), + DataType::List(std::sync::Arc::new(Field::new( "a", DataType::Extension("ext".to_owned(), Box::new(DataType::Int32), None), true, @@ -86,7 +86,7 @@ fn empty_extension() { DataType::Float64, DataType::Utf8, DataType::Binary, - DataType::List(Box::new(Field::new("a", DataType::Binary, true))), + DataType::List(std::sync::Arc::new(Field::new("a", DataType::Binary, true))), 
DataType::Union( vec![Field::new("a", DataType::Binary, true)], None, @@ -116,7 +116,7 @@ fn test_clone() { DataType::Float64, DataType::Utf8, DataType::Binary, - DataType::List(Box::new(Field::new("a", DataType::Binary, true))), + DataType::List(std::sync::Arc::new(Field::new("a", DataType::Binary, true))), ]; let a = datatypes .into_iter() diff --git a/tests/it/arrow.rs b/tests/it/arrow.rs index fb02fafbbc7..624b10cfc65 100644 --- a/tests/it/arrow.rs +++ b/tests/it/arrow.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::array::*; use arrow2::bitmap::Bitmap; use arrow2::datatypes::{DataType, Field, IntegerType, TimeUnit, UnionMode}; @@ -169,7 +171,11 @@ fn test_list() { let validity = [true, true, false, false, true].into_iter().collect(); let offsets = Offsets::try_from_iter(vec![0, 2, 2, 2, 0]).unwrap(); - let data_type = DataType::List(Box::new(Field::new("element", DataType::Utf8, true))); + let data_type = DataType::List(std::sync::Arc::new(Field::new( + "element", + DataType::Utf8, + true, + ))); let list = ListArray::::new( data_type.clone(), offsets.into(), @@ -184,7 +190,11 @@ fn test_list() { let validity = [true, true, false, false, true].into_iter().collect(); let offsets = Offsets::try_from_iter(vec![0, 2, 2, 2, 0]).unwrap(); - let data_type = DataType::LargeList(Box::new(Field::new("element", DataType::Utf8, true))); + let data_type = DataType::LargeList(std::sync::Arc::new(Field::new( + "element", + DataType::Utf8, + true, + ))); let list = ListArray::::new( data_type.clone(), offsets.into(), @@ -204,7 +214,7 @@ fn test_list_struct() { let validity = [true, true, false, true].into_iter().collect(); let offsets = Offsets::try_from_iter(vec![0, 1, 0, 2]).unwrap(); let list = ListArray::::new( - DataType::List(Box::new(Field::new( + DataType::List(std::sync::Arc::new(Field::new( "element", values.data_type().clone(), true, @@ -257,7 +267,10 @@ fn test_fixed_size_list() { let nulls = [true, true, false, true].into_iter().collect(); let array = 
FixedSizeListArray::new( - DataType::FixedSizeList(Box::new(Field::new("element", DataType::Int64, true)), 2), + DataType::FixedSizeList( + std::sync::Arc::new(Field::new("element", DataType::Int64, true)), + 2, + ), Box::new(values), Some(nulls), ); @@ -285,7 +298,7 @@ fn test_map() { let validity = [true, true, false, false].into_iter().collect(); let offsets = Offsets::try_from_iter(vec![0, 2, 0, 2]).unwrap(); let data_type = DataType::Map( - Box::new(Field::new("entries", fields.data_type().clone(), true)), + Arc::new(Field::new("entries", fields.data_type().clone(), true)), false, ); let map = MapArray::new( diff --git a/tests/it/compute/aggregate/memory.rs b/tests/it/compute/aggregate/memory.rs index be6ca35ee06..cfee4e7e38e 100644 --- a/tests/it/compute/aggregate/memory.rs +++ b/tests/it/compute/aggregate/memory.rs @@ -25,7 +25,7 @@ fn utf8() { #[test] fn fixed_size_list() { let data_type = - DataType::FixedSizeList(Box::new(Field::new("elem", DataType::Float32, false)), 3); + DataType::FixedSizeList(std::sync::Arc::new(Field::new("elem", DataType::Float32, false)), 3); let values = Box::new(Float32Array::from_slice([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])); let a = FixedSizeListArray::new(data_type, values, None); assert_eq!(6 * std::mem::size_of::(), estimated_bytes_size(&a)); diff --git a/tests/it/compute/cast.rs b/tests/it/compute/cast.rs index 01cb31d2f24..5b3db0ffc83 100644 --- a/tests/it/compute/cast.rs +++ b/tests/it/compute/cast.rs @@ -125,7 +125,7 @@ fn i32_to_list_i32() { let array = Int32Array::from_slice([5, 6, 7, 8, 9]); let b = cast( &array, - &DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + &DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, true))), CastOptions::default(), ) .unwrap(); @@ -149,7 +149,7 @@ fn i32_to_list_i32_nullable() { let array = Int32Array::from(input); let b = cast( &array, - &DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + 
&DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, true))), CastOptions::default(), ) .unwrap(); @@ -173,7 +173,7 @@ fn i32_to_list_f64_nullable_sliced() { let array = array.sliced(2, 4); let b = cast( &array, - &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + &DataType::List(std::sync::Arc::new(Field::new("item", DataType::Float64, true))), CastOptions::default(), ) .unwrap(); @@ -502,8 +502,8 @@ fn consistency() { Duration(TimeUnit::Millisecond), Duration(TimeUnit::Microsecond), Duration(TimeUnit::Nanosecond), - List(Box::new(Field::new("a", Utf8, true))), - LargeList(Box::new(Field::new("a", Utf8, true))), + List(std::sync::Arc::new(Field::new("a", Utf8, true))), + LargeList(std::sync::Arc::new(Field::new("a", Utf8, true))), ]; for d1 in &datatypes { for d2 in &datatypes { diff --git a/tests/it/compute/filter.rs b/tests/it/compute/filter.rs index 08a7f6cbcad..d1037643ef5 100644 --- a/tests/it/compute/filter.rs +++ b/tests/it/compute/filter.rs @@ -180,7 +180,7 @@ fn list_array() { let value_offsets = Buffer::from_slice_ref(&[0i64, 3, 6, 8, 8]); let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); + DataType::LargeList(std::sync::Arc::new(Field::new("item", DataType::Int32, false))); let list_data = ArrayData::builder(list_data_type) .len(4) .add_buffer(value_offsets) @@ -202,7 +202,7 @@ fn list_array() { let value_offsets = Buffer::from_slice_ref(&[0i64, 3, 3]); let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); + DataType::LargeList(std::sync::Arc::new(Field::new("item", DataType::Int32, false))); let expected = ArrayData::builder(list_data_type) .len(2) .add_buffer(value_offsets) diff --git a/tests/it/ffi/data.rs b/tests/it/ffi/data.rs index e5675ac60fe..aedb23bc112 100644 --- a/tests/it/ffi/data.rs +++ b/tests/it/ffi/data.rs @@ -210,7 +210,7 @@ fn list_sliced() -> Result<()> { let bitmap = Bitmap::from([true, false, false, 
true]).sliced(1, 3); let array = ListArray::::try_new( - DataType::List(Box::new(Field::new("a", DataType::Int32, true))), + DataType::List(std::sync::Arc::new(Field::new("a", DataType::Int32, true))), vec![0, 1, 1, 2].try_into().unwrap(), Box::new(PrimitiveArray::::from_slice([1, 2])), Some(bitmap), @@ -256,7 +256,7 @@ fn fixed_size_list_sliced() -> Result<()> { let bitmap = Bitmap::from([true, false, false, true]).sliced(1, 3); let array = FixedSizeListArray::try_new( - DataType::FixedSizeList(Box::new(Field::new("a", DataType::Int32, true)), 2), + DataType::FixedSizeList(std::sync::Arc::new(Field::new("a", DataType::Int32, true)), 2), Box::new(PrimitiveArray::::from_vec(vec![1, 2, 3, 4, 5, 6])), Some(bitmap), )?; @@ -312,7 +312,7 @@ fn dict() -> Result<()> { fn schema() -> Result<()> { let field = Field::new( "a", - DataType::List(Box::new(Field::new("a", DataType::UInt32, true))), + DataType::List(std::sync::Arc::new(Field::new("a", DataType::UInt32, true))), true, ); test_round_trip_schema(field)?; diff --git a/tests/it/io/avro/read.rs b/tests/it/io/avro/read.rs index 90aefbf4240..88125087e09 100644 --- a/tests/it/io/avro/read.rs +++ b/tests/it/io/avro/read.rs @@ -73,7 +73,7 @@ pub(super) fn schema() -> (AvroSchema, Schema) { Field::new("g", DataType::Utf8, true), Field::new( "h", - DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, true))), false, ), Field::new( @@ -331,7 +331,7 @@ fn schema_list() -> (AvroSchema, Schema) { let schema = Schema::from(vec![Field::new( "h", - DataType::List(Box::new(Field::new("item", DataType::Int32, false))), + DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, false))), false, )]); @@ -343,7 +343,7 @@ pub(super) fn data_list() -> Chunk> { let mut array = MutableListArray::>::new_from( Default::default(), - DataType::List(Box::new(Field::new("item", DataType::Int32, false))), + 
DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, false))), 0, ); array.try_extend(data).unwrap(); diff --git a/tests/it/io/avro/write.rs b/tests/it/io/avro/write.rs index 7cff7740fbb..5e995e7a095 100644 --- a/tests/it/io/avro/write.rs +++ b/tests/it/io/avro/write.rs @@ -42,20 +42,20 @@ pub(super) fn schema() -> Schema { ), Field::new( "list", - DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, true))), false, ), Field::new( "list nullable", - DataType::List(Box::new(Field::new("item", DataType::Int32, true))), + DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, true))), true, ), ]) } pub(super) fn data() -> Chunk> { - let list_dt = DataType::List(Box::new(Field::new("item", DataType::Int32, true))); - let list_dt1 = DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + let list_dt = DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, true))); + let list_dt1 = DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, true))); let columns = vec![ Box::new(Int64Array::from_slice([27, 47])) as Box, diff --git a/tests/it/io/json/read.rs b/tests/it/io/json/read.rs index 2f780693965..bdebaf7155c 100644 --- a/tests/it/io/json/read.rs +++ b/tests/it/io/json/read.rs @@ -170,7 +170,7 @@ fn deserialize_timestamp_string_ns() -> Result<()> { let json = json_deserializer::parse(data)?; - let data_type = DataType::List(Box::new(Field::new( + let data_type = DataType::List(std::sync::Arc::new(Field::new( "item", DataType::Timestamp(TimeUnit::Nanosecond, None), false, @@ -192,7 +192,7 @@ fn deserialize_timestamp_string_us() -> Result<()> { let json = json_deserializer::parse(data)?; - let data_type = DataType::List(Box::new(Field::new( + let data_type = DataType::List(std::sync::Arc::new(Field::new( "item", DataType::Timestamp(TimeUnit::Microsecond, None), false, @@ -214,7 +214,7 @@ fn 
deserialize_timestamp_string_ms() -> Result<()> { let json = json_deserializer::parse(data)?; - let data_type = DataType::List(Box::new(Field::new( + let data_type = DataType::List(std::sync::Arc::new(Field::new( "item", DataType::Timestamp(TimeUnit::Millisecond, None), false, @@ -236,7 +236,7 @@ fn deserialize_timestamp_string_s() -> Result<()> { let json = json_deserializer::parse(data)?; - let data_type = DataType::List(Box::new(Field::new( + let data_type = DataType::List(std::sync::Arc::new(Field::new( "item", DataType::Timestamp(TimeUnit::Second, None), false, @@ -258,7 +258,7 @@ fn deserialize_timestamp_string_tz_s() -> Result<()> { let json = json_deserializer::parse(data)?; - let data_type = DataType::List(Box::new(Field::new( + let data_type = DataType::List(std::sync::Arc::new(Field::new( "item", DataType::Timestamp(TimeUnit::Second, Some("+01:00".to_string())), false, @@ -282,7 +282,7 @@ fn deserialize_timestamp_int_ns() -> Result<()> { let json = json_deserializer::parse(data)?; - let data_type = DataType::List(Box::new(Field::new( + let data_type = DataType::List(std::sync::Arc::new(Field::new( "item", DataType::Timestamp(TimeUnit::Nanosecond, None), false, diff --git a/tests/it/io/json/write.rs b/tests/it/io/json/write.rs index 9c8d1313f23..ba07cf33298 100644 --- a/tests/it/io/json/write.rs +++ b/tests/it/io/json/write.rs @@ -326,7 +326,7 @@ fn list_of_struct() -> Result<()> { Field::new("c11", DataType::Int32, false), Field::new("c12", DataType::Struct(inner.clone()), false), ]; - let c1_datatype = DataType::List(Box::new(Field::new( + let c1_datatype = DataType::List(std::sync::Arc::new(Field::new( "s", DataType::Struct(fields.clone()), false, diff --git a/tests/it/io/ndjson/mod.rs b/tests/it/io/ndjson/mod.rs index bd5626b1cbd..f11e15b1ed0 100644 --- a/tests/it/io/ndjson/mod.rs +++ b/tests/it/io/ndjson/mod.rs @@ -50,12 +50,12 @@ fn case_list() -> (String, Box) { Field::new("a", DataType::Int64, true), Field::new( "b", - 
DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + DataType::List(std::sync::Arc::new(Field::new("item", DataType::Float64, true))), true, ), Field::new( "c", - DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + DataType::List(std::sync::Arc::new(Field::new("item", DataType::Boolean, true))), true, ), Field::new("d", DataType::Utf8, true), @@ -100,7 +100,7 @@ fn case_dict() -> (String, Box) { "# .to_string(); - let data_type = DataType::List(Box::new(Field::new( + let data_type = DataType::List(std::sync::Arc::new(Field::new( "item", DataType::Dictionary(u64::KEY_TYPE, Box::new(DataType::Utf8), false), true, @@ -235,7 +235,7 @@ fn case_nested_list() -> (String, Box) { DataType::Struct(vec![b_field.clone(), c_field.clone()]), true, ); - let a_list_data_type = DataType::List(Box::new(a_struct_field)); + let a_list_data_type = DataType::List(std::sync::Arc::new(a_struct_field)); let a_field = Field::new("a", a_list_data_type.clone(), true); let data = r#" diff --git a/tests/it/io/ndjson/read.rs b/tests/it/io/ndjson/read.rs index 2c8872ce1a0..82553ef36d2 100644 --- a/tests/it/io/ndjson/read.rs +++ b/tests/it/io/ndjson/read.rs @@ -172,12 +172,12 @@ fn infer_schema_mixed_list() -> Result<()> { Field::new("a", DataType::Int64, true), Field::new( "b", - DataType::List(Box::new(Field::new("item", DataType::Float64, true))), + DataType::List(std::sync::Arc::new(Field::new("item", DataType::Float64, true))), true, ), Field::new( "c", - DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + DataType::List(std::sync::Arc::new(Field::new("item", DataType::Boolean, true))), true, ), Field::new("d", DataType::Utf8, true), diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index cdf5b41573a..3d8a99b3fcc 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -1,5 +1,6 @@ use ethnum::AsI256; use std::io::{Cursor, Read, Seek}; +use std::sync::Arc; use arrow2::types::i256; use arrow2::{ 
@@ -97,7 +98,11 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { // {"f1": ["a", "b", None, "c"]} // ] let a = ListArray::::new( - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Utf8, + true, + ))), vec![0, 4].try_into().unwrap(), Utf8Array::::from([Some("a"), Some("b"), None, Some("c")]).boxed(), None, @@ -112,7 +117,7 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { "list_struct_list_nullable" => { let values = pyarrow_nested_edge("struct_list_nullable"); ListArray::::new( - DataType::List(Box::new(Field::new( + DataType::List(std::sync::Arc::new(Field::new( "item", values.data_type().clone(), true, @@ -306,7 +311,7 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { .boxed(); let array = ListArray::::new( - DataType::List(Box::new(Field::new( + DataType::List(std::sync::Arc::new(Field::new( "item", array.data_type().clone(), true, @@ -346,12 +351,20 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { match column { "list_int64_required_required" => { // [[0, 1], [], [2, 0, 3], [4, 5, 6], [], [7, 8, 9], [], [10]] - let data_type = DataType::List(Box::new(Field::new("item", DataType::Int64, false))); + let data_type = DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Int64, + false, + ))); ListArray::::new(data_type, offsets, values, None).boxed() } "list_int64_optional_required" => { // [[0, 1], [], [2, 0, 3], [4, 5, 6], [], [7, 8, 9], [], [10]] - let data_type = DataType::List(Box::new(Field::new("item", DataType::Int64, true))); + let data_type = DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Int64, + true, + ))); ListArray::::new(data_type, offsets, values, None).boxed() } "list_nested_i64" => { @@ -429,7 +442,7 @@ pub fn pyarrow_nested_nullable(column: &str) -> Box { ])); // [0, 2, 2, 5, 8, 8, 11, 11, 12] // [[a1, a2], None, [a3, a4, a5], [a6, a7, a8], [], [a9, a10, a11], None, [a12]] - let data_type = 
DataType::List(Box::new(field)); + let data_type = DataType::List(std::sync::Arc::new(field)); ListArray::::new(data_type, offsets, values, validity).boxed() } } @@ -829,7 +842,7 @@ pub fn pyarrow_required_statistics(column: &str) -> Statistics { pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { let new_list = |array: Box, nullable: bool| { ListArray::::new( - DataType::List(Box::new(Field::new( + DataType::List(std::sync::Arc::new(Field::new( "item", array.data_type().clone(), nullable, @@ -1042,7 +1055,7 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics { pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { let new_list = |array: Box| { ListArray::::new( - DataType::List(Box::new(Field::new( + DataType::List(std::sync::Arc::new(Field::new( "item", array.data_type().clone(), true, @@ -1373,7 +1386,10 @@ pub fn pyarrow_map(column: &str) -> Box { Field::new("value", DataType::Utf8, true), ]); MapArray::try_new( - DataType::Map(Box::new(Field::new("entries", dt.clone(), false)), false), + DataType::Map( + std::sync::Arc::new(Field::new("entries", dt.clone(), false)), + false, + ), vec![0, 2].try_into().unwrap(), StructArray::try_new( dt, @@ -1398,7 +1414,10 @@ pub fn pyarrow_map(column: &str) -> Box { Field::new("value", DataType::Utf8, true), ]); MapArray::try_new( - DataType::Map(Box::new(Field::new("entries", dt.clone(), false)), false), + DataType::Map( + std::sync::Arc::new(Field::new("entries", dt.clone(), false)), + false, + ), vec![0, 2].try_into().unwrap(), StructArray::try_new( dt, @@ -1428,7 +1447,7 @@ pub fn pyarrow_map_statistics(column: &str) -> Statistics { .collect::>(); MapArray::new( DataType::Map( - Box::new(Field::new( + Arc::new(Field::new( "entries", DataType::Struct(fields.clone()), false, @@ -1931,7 +1950,7 @@ fn nested_dict_data(data_type: DataType) -> Result<(Schema, Chunk let indices = PrimitiveArray::from_values((0..3u64).map(|x| x % 2)); let values = 
DictionaryArray::try_from_keys(indices, values).unwrap(); let values = ListArray::try_new( - DataType::List(Box::new(Field::new( + DataType::List(std::sync::Arc::new(Field::new( "item", values.data_type().clone(), false, diff --git a/tests/it/scalar/fixed_size_list.rs b/tests/it/scalar/fixed_size_list.rs index 89809d343a2..ef8eddffb95 100644 --- a/tests/it/scalar/fixed_size_list.rs +++ b/tests/it/scalar/fixed_size_list.rs @@ -7,7 +7,7 @@ use arrow2::{ #[allow(clippy::eq_op)] #[test] fn equal() { - let dt = DataType::FixedSizeList(Box::new(Field::new("a", DataType::Boolean, true)), 2); + let dt = DataType::FixedSizeList(std::sync::Arc::new(Field::new("a", DataType::Boolean, true)), 2); let a = FixedSizeListScalar::new( dt.clone(), Some(BooleanArray::from_slice([true, false]).boxed()), @@ -26,7 +26,7 @@ fn equal() { #[test] fn basics() { - let dt = DataType::FixedSizeList(Box::new(Field::new("a", DataType::Boolean, true)), 2); + let dt = DataType::FixedSizeList(std::sync::Arc::new(Field::new("a", DataType::Boolean, true)), 2); let a = FixedSizeListScalar::new( dt.clone(), Some(BooleanArray::from_slice([true, false]).boxed()), diff --git a/tests/it/scalar/list.rs b/tests/it/scalar/list.rs index d8954e6bba0..44e6e32b26e 100644 --- a/tests/it/scalar/list.rs +++ b/tests/it/scalar/list.rs @@ -7,7 +7,7 @@ use arrow2::{ #[allow(clippy::eq_op)] #[test] fn equal() { - let dt = DataType::List(Box::new(Field::new("a", DataType::Boolean, true))); + let dt = DataType::List(std::sync::Arc::new(Field::new("a", DataType::Boolean, true))); let a = ListScalar::::new( dt.clone(), Some(BooleanArray::from_slice([true, false]).boxed()), @@ -23,7 +23,7 @@ fn equal() { #[test] fn basics() { - let dt = DataType::List(Box::new(Field::new("a", DataType::Boolean, true))); + let dt = DataType::List(std::sync::Arc::new(Field::new("a", DataType::Boolean, true))); let a = ListScalar::::new( dt.clone(), Some(BooleanArray::from_slice([true, false]).boxed()), diff --git a/tests/it/scalar/map.rs 
b/tests/it/scalar/map.rs index 1a232a5049c..1fb29eeb628 100644 --- a/tests/it/scalar/map.rs +++ b/tests/it/scalar/map.rs @@ -30,7 +30,7 @@ fn equal() { ) .unwrap(); - let dt = DataType::Map(Box::new(Field::new("entries", kv_dt, true)), false); + let dt = DataType::Map(std::sync::Arc::new(Field::new("entries", kv_dt, true)), false); let a = MapScalar::new(dt.clone(), Some(Box::new(kv_array1))); let b = MapScalar::new(dt.clone(), None); assert_eq!(a, a); @@ -57,7 +57,7 @@ fn basics() { ) .unwrap(); - let dt = DataType::Map(Box::new(Field::new("entries", kv_dt, true)), false); + let dt = DataType::Map(std::sync::Arc::new(Field::new("entries", kv_dt, true)), false); let a = MapScalar::new(dt.clone(), Some(Box::new(kv_array.clone()))); assert_eq!(kv_array, a.values().as_ref()); From c1e58ad8db4063e81809311249ee0db4c853e3a9 Mon Sep 17 00:00:00 2001 From: Clement Rey Date: Thu, 13 Apr 2023 17:21:13 +0200 Subject: [PATCH 2/4] Taking care of Timestamp in the most pain-free way I could come up with --- src/compute/arithmetics/time.rs | 8 +- src/compute/cast/primitive_to.rs | 3 +- src/compute/cast/utf8_to.rs | 6 +- src/datatypes/mod.rs | 8 +- src/ffi/schema.rs | 28 +++++-- src/io/avro/read/schema.rs | 6 +- src/io/csv/utils.rs | 17 ++-- src/io/ipc/read/schema.rs | 4 +- src/io/ipc/write/schema.rs | 2 +- src/io/json_integration/read/schema.rs | 2 +- src/io/parquet/read/schema/convert.rs | 109 ++++++++++++++++++++----- src/temporal_conversions.rs | 12 +-- tests/it/array/primitive/fmt.rs | 8 +- tests/it/arrow.rs | 4 +- tests/it/compute/arithmetics/time.rs | 42 +++++----- tests/it/compute/cast.rs | 44 ++++++++-- tests/it/compute/temporal.rs | 20 +++-- tests/it/ffi/data.rs | 8 +- tests/it/io/csv/read.rs | 4 +- tests/it/io/csv/write.rs | 8 +- tests/it/io/json/read.rs | 4 +- tests/it/io/parquet/mod.rs | 8 +- tests/it/io/print.rs | 2 +- tests/it/temporal_conversions.rs | 5 +- 24 files changed, 250 insertions(+), 112 deletions(-) diff --git a/src/compute/arithmetics/time.rs 
b/src/compute/arithmetics/time.rs index e049b3820b5..1cb0446fba7 100644 --- a/src/compute/arithmetics/time.rs +++ b/src/compute/arithmetics/time.rs @@ -81,7 +81,7 @@ fn create_scale(lhs: &DataType, rhs: &DataType) -> Result { /// ]) /// .to(DataType::Timestamp( /// TimeUnit::Second, -/// Some("America/New_York".to_string()), +/// Some(std::sync::Arc::new("America/New_York".to_string())), /// )); /// /// let duration = PrimitiveArray::from([Some(10i64), Some(20i64), None, Some(30i64)]) @@ -96,7 +96,7 @@ fn create_scale(lhs: &DataType, rhs: &DataType) -> Result { /// ]) /// .to(DataType::Timestamp( /// TimeUnit::Second, -/// Some("America/New_York".to_string()), +/// Some(std::sync::Arc::new("America/New_York".to_string())), /// )); /// /// assert_eq!(result, expected); @@ -161,7 +161,7 @@ where /// ]) /// .to(DataType::Timestamp( /// TimeUnit::Second, -/// Some("America/New_York".to_string()), +/// Some(std::sync::Arc::new("America/New_York".to_string())), /// )); /// /// let duration = PrimitiveArray::from([Some(10i64), Some(20i64), None, Some(30i64)]) @@ -176,7 +176,7 @@ where /// ]) /// .to(DataType::Timestamp( /// TimeUnit::Second, -/// Some("America/New_York".to_string()), +/// Some(std::sync::Arc::new("America/New_York".to_string())), /// )); /// /// assert_eq!(result, expected); diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index 585e826cddb..6a5d58cbc02 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -1,4 +1,5 @@ use std::hash::Hash; +use std::sync::Arc; use num_traits::{AsPrimitive, Float, ToPrimitive}; @@ -406,7 +407,7 @@ pub fn timestamp_to_timestamp( from: &PrimitiveArray, from_unit: TimeUnit, to_unit: TimeUnit, - tz: &Option, + tz: &Option>, ) -> PrimitiveArray { let from_size = time_unit_multiple(from_unit); let to_size = time_unit_multiple(to_unit); diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index 6ee38588696..610d1c11e7f 100644 ---
a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use chrono::Datelike; use crate::{ @@ -127,7 +129,7 @@ pub fn utf8_to_naive_timestamp_ns(from: &Utf8Array) -> PrimitiveAr pub(super) fn utf8_to_timestamp_ns_dyn( from: &dyn Array, - timezone: String, + timezone: Arc, ) -> Result> { let from = from.as_any().downcast_ref().unwrap(); utf8_to_timestamp_ns::(from, timezone) @@ -138,7 +140,7 @@ pub(super) fn utf8_to_timestamp_ns_dyn( /// [`crate::temporal_conversions::utf8_to_timestamp_ns`] applied for RFC3339 formatting pub fn utf8_to_timestamp_ns( from: &Utf8Array, - timezone: String, + timezone: Arc, ) -> Result> { utf8_to_timestamp_ns_(from, RFC3339, timezone) } diff --git a/src/datatypes/mod.rs b/src/datatypes/mod.rs index 8866446b798..dc4f187a57e 100644 --- a/src/datatypes/mod.rs +++ b/src/datatypes/mod.rs @@ -111,7 +111,7 @@ pub enum DataType { /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 /// When the timezone is not specified, the timestamp is considered to have no timezone /// and is represented _as is_ - Timestamp(TimeUnit, Option), + Timestamp(TimeUnit, Option>), /// An [`i32`] representing the elapsed time since UNIX epoch (1970-01-01) /// in days. 
Date32, @@ -218,7 +218,9 @@ impl From for arrow_schema::DataType { DataType::Float16 => Self::Float16, DataType::Float32 => Self::Float32, DataType::Float64 => Self::Float64, - DataType::Timestamp(unit, tz) => Self::Timestamp(unit.into(), tz), + DataType::Timestamp(unit, tz) => { + Self::Timestamp(unit.into(), tz.map(Arc::unwrap_or_clone_polyfill)) + } DataType::Date32 => Self::Date32, DataType::Date64 => Self::Date64, DataType::Time32(unit) => Self::Time32(unit.into()), @@ -280,7 +282,7 @@ impl From for DataType { DataType::Float16 => Self::Float16, DataType::Float32 => Self::Float32, DataType::Float64 => Self::Float64, - DataType::Timestamp(unit, tz) => Self::Timestamp(unit.into(), tz), + DataType::Timestamp(unit, tz) => Self::Timestamp(unit.into(), tz.map(Arc::new)), DataType::Date32 => Self::Date32, DataType::Date64 => Self::Date64, DataType::Time32(unit) => Self::Time32(unit.into()), diff --git a/src/ffi/schema.rs b/src/ffi/schema.rs index 2e86dea49fe..b36addc21ff 100644 --- a/src/ffi/schema.rs +++ b/src/ffi/schema.rs @@ -287,10 +287,18 @@ unsafe fn to_data_type(schema: &ArrowSchema) -> Result { ["tsn", ""] => DataType::Timestamp(TimeUnit::Nanosecond, None), // Timestamps with timezone - ["tss", tz] => DataType::Timestamp(TimeUnit::Second, Some(tz.to_string())), - ["tsm", tz] => DataType::Timestamp(TimeUnit::Millisecond, Some(tz.to_string())), - ["tsu", tz] => DataType::Timestamp(TimeUnit::Microsecond, Some(tz.to_string())), - ["tsn", tz] => DataType::Timestamp(TimeUnit::Nanosecond, Some(tz.to_string())), + ["tss", tz] => { + DataType::Timestamp(TimeUnit::Second, Some(Arc::new(tz.to_string()))) + } + ["tsm", tz] => { + DataType::Timestamp(TimeUnit::Millisecond, Some(Arc::new(tz.to_string()))) + } + ["tsu", tz] => { + DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::new(tz.to_string()))) + } + ["tsn", tz] => { + DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::new(tz.to_string()))) + } ["w", size_raw] => { // Example: "w:42" fixed-width binary [42 bytes] 
@@ -433,7 +441,7 @@ fn to_format(data_type: &DataType) -> String { format!( "ts{}:{}", unit, - tz.as_ref().map(|x| x.as_ref()).unwrap_or("") + tz.as_ref().map(|x| x.as_str()).unwrap_or("") ) } DataType::Decimal(precision, scale) => format!("d:{precision},{scale}"), @@ -576,7 +584,10 @@ mod tests { true, ), ]), - DataType::Map(std::sync::Arc::new(Field::new("a", DataType::Int64, true)), true), + DataType::Map( + std::sync::Arc::new(Field::new("a", DataType::Int64, true)), + true, + ), DataType::Union( vec![ Field::new("a", DataType::Int64, true), @@ -609,7 +620,10 @@ mod tests { TimeUnit::Nanosecond, ] { dts.push(DataType::Timestamp(time_unit, None)); - dts.push(DataType::Timestamp(time_unit, Some("00:00".to_string()))); + dts.push(DataType::Timestamp( + time_unit, + Some(Arc::new("00:00".to_string())), + )); dts.push(DataType::Duration(time_unit)); } for interval_type in [ diff --git a/src/io/avro/read/schema.rs b/src/io/avro/read/schema.rs index 5999503941f..07b988fd71e 100644 --- a/src/io/avro/read/schema.rs +++ b/src/io/avro/read/schema.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use avro_schema::schema::{Enum, Fixed, Record, Schema as AvroSchema}; use crate::datatypes::*; @@ -52,10 +54,10 @@ fn schema_to_field(schema: &AvroSchema, name: Option<&str>, props: Metadata) -> Some(logical) => match logical { avro_schema::schema::LongLogical::Time => DataType::Time64(TimeUnit::Microsecond), avro_schema::schema::LongLogical::TimestampMillis => { - DataType::Timestamp(TimeUnit::Millisecond, Some("00:00".to_string())) + DataType::Timestamp(TimeUnit::Millisecond, Some(Arc::new("00:00".to_string()))) } avro_schema::schema::LongLogical::TimestampMicros => { - DataType::Timestamp(TimeUnit::Microsecond, Some("00:00".to_string())) + DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::new("00:00".to_string()))) } avro_schema::schema::LongLogical::LocalTimestampMillis => { DataType::Timestamp(TimeUnit::Millisecond, None) diff --git a/src/io/csv/utils.rs b/src/io/csv/utils.rs 
index fa8c01f43d7..7cf8d66595a 100644 --- a/src/io/csv/utils.rs +++ b/src/io/csv/utils.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use crate::datatypes::{DataType, Field, TimeUnit}; use ahash::AHashSet; @@ -27,15 +29,18 @@ fn is_naive_datetime(string: &str) -> bool { string.parse::().is_ok() } -fn is_datetime(string: &str) -> Option { +fn is_datetime(string: &str) -> Option> { let mut parsed = chrono::format::Parsed::new(); let fmt = chrono::format::StrftimeItems::new(RFC3339); if chrono::format::parse(&mut parsed, string, fmt).is_ok() { - parsed.offset.map(|x| { - let hours = x / 60 / 60; - let minutes = x / 60 - hours * 60; - format!("{hours:03}:{minutes:02}") - }) + parsed + .offset + .map(|x| { + let hours = x / 60 / 60; + let minutes = x / 60 - hours * 60; + format!("{hours:03}:{minutes:02}") + }) + .map(Arc::new) } else { None } diff --git a/src/io/ipc/read/schema.rs b/src/io/ipc/read/schema.rs index eb71cee1ea5..b625f19d484 100644 --- a/src/io/ipc/read/schema.rs +++ b/src/io/ipc/read/schema.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow_format::ipc::{ planus::ReadAsRoot, FieldRef, FixedSizeListRef, MapRef, TimeRef, TimestampRef, UnionRef, }; @@ -104,7 +106,7 @@ fn deserialize_time(time: TimeRef) -> Result<(DataType, IpcField)> { } fn deserialize_timestamp(timestamp: TimestampRef) -> Result<(DataType, IpcField)> { - let timezone = timestamp.timezone()?.map(|tz| tz.to_string()); + let timezone = timestamp.timezone()?.map(|tz| Arc::new(tz.to_string())); let time_unit = deserialize_timeunit(timestamp.unit()?)?; Ok(( DataType::Timestamp(time_unit, timezone), diff --git a/src/io/ipc/write/schema.rs b/src/io/ipc/write/schema.rs index 1c4dab8e393..5c35c8104f3 100644 --- a/src/io/ipc/write/schema.rs +++ b/src/io/ipc/write/schema.rs @@ -228,7 +228,7 @@ fn serialize_type(data_type: &DataType) -> arrow_format::ipc::Type { })), Timestamp(unit, tz) => ipc::Type::Timestamp(Box::new(ipc::Timestamp { unit: serialize_time_unit(unit), - timezone: tz.as_ref().cloned(), + 
timezone: tz.as_ref().map(|tz| tz.as_str().to_owned()), })), Interval(unit) => ipc::Type::Interval(Box::new(ipc::Interval { unit: match unit { diff --git a/src/io/json_integration/read/schema.rs b/src/io/json_integration/read/schema.rs index e127b4852d2..66b88f1f8b7 100644 --- a/src/io/json_integration/read/schema.rs +++ b/src/io/json_integration/read/schema.rs @@ -211,7 +211,7 @@ fn to_data_type(item: &Value, mut children: Vec) -> Result { Some(Value::String(tz)) => Ok(Some(tz.clone())), _ => Err(Error::OutOfSpec("timezone must be a string".to_string())), }?; - DataType::Timestamp(unit, tz) + DataType::Timestamp(unit, tz.map(Arc::new)) } "date" => match item.get("unit") { Some(p) if p == "DAY" => DataType::Date32, diff --git a/src/io/parquet/read/schema/convert.rs b/src/io/parquet/read/schema/convert.rs index c795f724232..1d6442ca52a 100644 --- a/src/io/parquet/read/schema/convert.rs +++ b/src/io/parquet/read/schema/convert.rs @@ -1,4 +1,6 @@ //! This module has a single entry point, [`parquet_to_arrow_schema`]. +use std::sync::Arc; + use parquet2::schema::{ types::{ FieldInfo, GroupConvertedType, GroupLogicalType, IntegerType, ParquetType, PhysicalType, @@ -95,12 +97,14 @@ fn from_int64( match unit { ParquetTimeUnit::Milliseconds => { - DataType::Timestamp(TimeUnit::Millisecond, timezone) + DataType::Timestamp(TimeUnit::Millisecond, timezone.map(Arc::new)) } ParquetTimeUnit::Microseconds => { - DataType::Timestamp(TimeUnit::Microsecond, timezone) + DataType::Timestamp(TimeUnit::Microsecond, timezone.map(Arc::new)) + } + ParquetTimeUnit::Nanoseconds => { + DataType::Timestamp(TimeUnit::Nanosecond, timezone.map(Arc::new)) } - ParquetTimeUnit::Nanoseconds => DataType::Timestamp(TimeUnit::Nanosecond, timezone), } } (Some(Time { unit, .. 
}), _) => match unit { @@ -521,7 +525,11 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(std::sync::Arc::new(Field::new("element", DataType::Utf8, true))), + DataType::List(std::sync::Arc::new(Field::new( + "element", + DataType::Utf8, + true, + ))), false, )); } @@ -535,7 +543,11 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(std::sync::Arc::new(Field::new("element", DataType::Utf8, false))), + DataType::List(std::sync::Arc::new(Field::new( + "element", + DataType::Utf8, + false, + ))), true, )); } @@ -553,11 +565,18 @@ mod tests { // } // } { - let arrow_inner_list = - DataType::List(std::sync::Arc::new(Field::new("element", DataType::Int32, false))); + let arrow_inner_list = DataType::List(std::sync::Arc::new(Field::new( + "element", + DataType::Int32, + false, + ))); arrow_fields.push(Field::new( "array_of_arrays", - DataType::List(std::sync::Arc::new(Field::new("element", arrow_inner_list, false))), + DataType::List(std::sync::Arc::new(Field::new( + "element", + arrow_inner_list, + false, + ))), true, )); } @@ -571,7 +590,11 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(std::sync::Arc::new(Field::new("element", DataType::Utf8, true))), + DataType::List(std::sync::Arc::new(Field::new( + "element", + DataType::Utf8, + true, + ))), true, )); } @@ -583,7 +606,11 @@ mod tests { { arrow_fields.push(Field::new( "my_list", - DataType::List(std::sync::Arc::new(Field::new("element", DataType::Int32, true))), + DataType::List(std::sync::Arc::new(Field::new( + "element", + DataType::Int32, + true, + ))), true, )); } @@ -602,7 +629,11 @@ mod tests { ]); arrow_fields.push(Field::new( "my_list", - DataType::List(std::sync::Arc::new(Field::new("element", arrow_struct, true))), + DataType::List(std::sync::Arc::new(Field::new( + "element", + arrow_struct, + true, + ))), true, )); } @@ -634,7 +665,11 @@ mod tests { let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, 
false)]); arrow_fields.push(Field::new( "my_list", - DataType::List(std::sync::Arc::new(Field::new("my_list_tuple", arrow_struct, true))), + DataType::List(std::sync::Arc::new(Field::new( + "my_list_tuple", + arrow_struct, + true, + ))), true, )); } @@ -644,7 +679,11 @@ mod tests { { arrow_fields.push(Field::new( "name", - DataType::List(std::sync::Arc::new(Field::new("name", DataType::Int32, true))), + DataType::List(std::sync::Arc::new(Field::new( + "name", + DataType::Int32, + true, + ))), true, )); } @@ -689,7 +728,11 @@ mod tests { { arrow_fields.push(Field::new( "my_list1", - DataType::List(std::sync::Arc::new(Field::new("element", DataType::Utf8, true))), + DataType::List(std::sync::Arc::new(Field::new( + "element", + DataType::Utf8, + true, + ))), false, )); } @@ -703,7 +746,11 @@ mod tests { { arrow_fields.push(Field::new( "my_list2", - DataType::List(std::sync::Arc::new(Field::new("element", DataType::Utf8, false))), + DataType::List(std::sync::Arc::new(Field::new( + "element", + DataType::Utf8, + false, + ))), true, )); } @@ -717,7 +764,11 @@ mod tests { { arrow_fields.push(Field::new( "my_list3", - DataType::List(std::sync::Arc::new(Field::new("element", DataType::Utf8, false))), + DataType::List(std::sync::Arc::new(Field::new( + "element", + DataType::Utf8, + false, + ))), false, )); } @@ -848,7 +899,11 @@ mod tests { Field::new("string", DataType::Utf8, true), Field::new( "bools", - DataType::List(std::sync::Arc::new(Field::new("bools", DataType::Boolean, true))), + DataType::List(std::sync::Arc::new(Field::new( + "bools", + DataType::Boolean, + true, + ))), true, ), Field::new("date", DataType::Date32, true), @@ -867,7 +922,7 @@ mod tests { ), Field::new( "ts_nano", - DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".to_string())), + DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::new("+00:00".to_string()))), false, ), ]; @@ -930,12 +985,20 @@ mod tests { Field::new("string", DataType::Utf8, true), Field::new( "bools", - 
DataType::List(std::sync::Arc::new(Field::new("element", DataType::Boolean, true))), + DataType::List(std::sync::Arc::new(Field::new( + "element", + DataType::Boolean, + true, + ))), true, ), Field::new( "bools_non_null", - DataType::List(std::sync::Arc::new(Field::new("element", DataType::Boolean, false))), + DataType::List(std::sync::Arc::new(Field::new( + "element", + DataType::Boolean, + false, + ))), false, ), Field::new("date", DataType::Date32, true), @@ -958,7 +1021,11 @@ mod tests { Field::new("uint32", DataType::UInt32, false), Field::new( "int32", - DataType::List(std::sync::Arc::new(Field::new("element", DataType::Int32, true))), + DataType::List(std::sync::Arc::new(Field::new( + "element", + DataType::Int32, + true, + ))), false, ), ]), diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index fb06b534957..080c8c794e7 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -1,5 +1,7 @@ //! Conversion methods for dates and times. +use std::sync::Arc; + use chrono::{ format::{parse, Parsed, StrftimeItems}, Datelike, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, @@ -287,7 +289,7 @@ pub fn utf8_to_naive_timestamp_scalar(value: &str, fmt: &str, tu: &TimeUnit) -> fn utf8_to_timestamp_ns_impl( array: &Utf8Array, fmt: &str, - timezone: String, + timezone: Arc, tz: T, ) -> PrimitiveArray { let iter = array @@ -312,9 +314,9 @@ pub fn parse_offset_tz(timezone: &str) -> Result { fn chrono_tz_utf_to_timestamp_ns( array: &Utf8Array, fmt: &str, - timezone: String, + timezone: Arc, ) -> Result> { - let tz = parse_offset_tz(&timezone)?; + let tz = parse_offset_tz(timezone.as_str())?; Ok(utf8_to_timestamp_ns_impl(array, fmt, timezone, tz)) } @@ -322,7 +324,7 @@ fn chrono_tz_utf_to_timestamp_ns( fn chrono_tz_utf_to_timestamp_ns( _: &Utf8Array, _: &str, - timezone: String, + timezone: Arc, ) -> Result> { Err(Error::InvalidArgumentError(format!( "timezone \"{timezone}\" cannot be parsed (feature chrono-tz is not active)", @@ 
-340,7 +342,7 @@ fn chrono_tz_utf_to_timestamp_ns( pub fn utf8_to_timestamp_ns( array: &Utf8Array, fmt: &str, - timezone: String, + timezone: Arc, ) -> Result> { let tz = parse_offset(timezone.as_str()); diff --git a/tests/it/array/primitive/fmt.rs b/tests/it/array/primitive/fmt.rs index c04c1e1c5e5..54acc56bdad 100644 --- a/tests/it/array/primitive/fmt.rs +++ b/tests/it/array/primitive/fmt.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::{ array::*, datatypes::*, @@ -118,7 +120,7 @@ fn debug_timestamp_ns() { fn debug_timestamp_tz_ns() { let array = Int64Array::from(&[Some(1), None, Some(2)]).to(DataType::Timestamp( TimeUnit::Nanosecond, - Some("+02:00".to_string()), + Some(Arc::new("+02:00".to_string())), )); assert_eq!( format!("{array:?}"), @@ -130,7 +132,7 @@ fn debug_timestamp_tz_ns() { fn debug_timestamp_tz_not_parsable() { let array = Int64Array::from(&[Some(1), None, Some(2)]).to(DataType::Timestamp( TimeUnit::Nanosecond, - Some("aa".to_string()), + Some(Arc::new("aa".to_string())), )); assert_eq!( format!("{array:?}"), @@ -143,7 +145,7 @@ fn debug_timestamp_tz_not_parsable() { fn debug_timestamp_tz1_ns() { let array = Int64Array::from(&[Some(1), None, Some(2)]).to(DataType::Timestamp( TimeUnit::Nanosecond, - Some("Europe/Lisbon".to_string()), + Some(Arc::new("Europe/Lisbon".to_string())), )); assert_eq!( format!("{array:?}"), diff --git a/tests/it/arrow.rs b/tests/it/arrow.rs index 624b10cfc65..7f013bfe2db 100644 --- a/tests/it/arrow.rs +++ b/tests/it/arrow.rs @@ -92,7 +92,7 @@ fn test_primitive() { let array = PrimitiveArray::new(data_type, vec![1, 2, 3].into(), None); test_conversion(&array); - let data_type = DataType::Timestamp(TimeUnit::Second, Some("UTC".into())); + let data_type = DataType::Timestamp(TimeUnit::Second, Some(Arc::new("UTC".into()))); let nulls = Bitmap::from_iter([true, true, false]); let array = PrimitiveArray::new(data_type, vec![1_i64, 24, 0].into(), Some(nulls)); test_conversion(&array); @@ -136,7 +136,7 @@ fn make_struct() 
-> StructArray { let a1 = BinaryArray::::from_iter([Some("s".as_bytes()), Some(b"sd\xFFfk\x23"), None]); let a2 = BinaryArray::::from_iter([Some("45848".as_bytes()), Some(b"\x03\xFF"), None]); - let data_type = DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".into())); + let data_type = DataType::Timestamp(TimeUnit::Millisecond, Some(Arc::new("UTC".into()))); let nulls = Bitmap::from_iter([true, true, false]); let a3 = PrimitiveArray::new(data_type, vec![1_i64, 24, 0].into(), Some(nulls)); diff --git a/tests/it/compute/arithmetics/time.rs b/tests/it/compute/arithmetics/time.rs index 9f31dd60a97..6fdeec358c0 100644 --- a/tests/it/compute/arithmetics/time.rs +++ b/tests/it/compute/arithmetics/time.rs @@ -8,7 +8,7 @@ use arrow2::types::months_days_ns; fn test_adding_timestamp() { let timestamp = PrimitiveArray::from([Some(100000i64), Some(200000i64), None, Some(300000i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some("America/New_York".to_string())), + DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), ); let duration = PrimitiveArray::from([Some(10i64), Some(20i64), None, Some(30i64)]) @@ -17,7 +17,7 @@ fn test_adding_timestamp() { let result = add_duration(×tamp, &duration); let expected = PrimitiveArray::from([Some(100010i64), Some(200020i64), None, Some(300030i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some("America/New_York".to_string())), + DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), ); assert_eq!(result, expected); @@ -27,7 +27,7 @@ fn test_adding_timestamp() { let result = add_duration_scalar(×tamp, &duration); let expected = PrimitiveArray::from([Some(100010i64), Some(200010i64), None, Some(300010i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some("America/New_York".to_string())), + DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), ); assert_eq!(result, expected); } @@ -36,11 +36,11 @@ fn 
test_adding_timestamp() { fn test_adding_duration_different_scale() { let timestamp = PrimitiveArray::from([Some(100000i64), Some(200000i64), None, Some(300000i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some("America/New_York".to_string())), + DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), ); let expected = PrimitiveArray::from([Some(100010i64), Some(200020i64), None, Some(300030i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some("America/New_York".to_string())), + DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), ); // Testing duration in milliseconds @@ -69,20 +69,20 @@ fn test_adding_duration_different_scale() { #[test] fn test_adding_subtract_timestamps_scale() { let timestamp = PrimitiveArray::from([Some(10i64), Some(20i64), None, Some(30i64)]).to( - DataType::Timestamp(TimeUnit::Millisecond, Some("America/New_York".to_string())), + DataType::Timestamp(TimeUnit::Millisecond, Some(std::sync::Arc::new("America/New_york".to_string()))), ); let duration = PrimitiveArray::from([Some(1i64), Some(2i64), None, Some(3i64)]) .to(DataType::Duration(TimeUnit::Second)); let expected = PrimitiveArray::from([Some(1_010i64), Some(2_020i64), None, Some(3_030i64)]).to( - DataType::Timestamp(TimeUnit::Millisecond, Some("America/New_York".to_string())), + DataType::Timestamp(TimeUnit::Millisecond, Some(std::sync::Arc::new("America/New_york".to_string()))), ); let result = add_duration(×tamp, &duration); assert_eq!(result, expected); let timestamp = PrimitiveArray::from([Some(10i64), Some(20i64), None, Some(30i64)]).to( - DataType::Timestamp(TimeUnit::Nanosecond, Some("America/New_York".to_string())), + DataType::Timestamp(TimeUnit::Nanosecond, Some(std::sync::Arc::new("America/New_york".to_string()))), ); let duration = PrimitiveArray::from([Some(1i64), Some(2i64), None, Some(3i64)]) .to(DataType::Duration(TimeUnit::Second)); @@ -95,7 +95,7 @@ fn 
test_adding_subtract_timestamps_scale() { ]) .to(DataType::Timestamp( TimeUnit::Nanosecond, - Some("America/New_York".to_string()), + Some(std::sync::Arc::new("America/New_york".to_string())), )); let result = add_duration(×tamp, &duration); @@ -106,7 +106,7 @@ fn test_adding_subtract_timestamps_scale() { fn test_subtract_timestamp() { let timestamp = PrimitiveArray::from([Some(100000i64), Some(200000i64), None, Some(300000i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some("America/New_York".to_string())), + DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), ); let duration = PrimitiveArray::from([Some(10i64), Some(20i64), None, Some(30i64)]) @@ -115,7 +115,7 @@ fn test_subtract_timestamp() { let result = subtract_duration(×tamp, &duration); let expected = PrimitiveArray::from([Some(99990i64), Some(199980i64), None, Some(299970i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some("America/New_York".to_string())), + DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), ); assert_eq!(result, expected); @@ -125,11 +125,11 @@ fn test_subtract_timestamp() { fn test_subtracting_duration_different_scale() { let timestamp = PrimitiveArray::from([Some(100000i64), Some(200000i64), None, Some(300000i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some("America/New_York".to_string())), + DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), ); let expected = PrimitiveArray::from([Some(99990i64), Some(199980i64), None, Some(299970i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some("America/New_York".to_string())), + DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), ); // Testing duration in milliseconds @@ -158,21 +158,21 @@ fn test_subtracting_duration_different_scale() { #[test] fn test_subtracting_subtract_timestamps_scale() { let timestamp = PrimitiveArray::from([Some(10i64), 
Some(20i64), None, Some(30i64)]).to( - DataType::Timestamp(TimeUnit::Millisecond, Some("America/New_York".to_string())), + DataType::Timestamp(TimeUnit::Millisecond, Some(std::sync::Arc::new("America/New_york".to_string()))), ); let duration = PrimitiveArray::from([Some(1i64), Some(2i64), None, Some(3i64)]) .to(DataType::Duration(TimeUnit::Second)); let expected = PrimitiveArray::from([Some(-990i64), Some(-1_980i64), None, Some(-2_970i64)]).to( - DataType::Timestamp(TimeUnit::Millisecond, Some("America/New_York".to_string())), + DataType::Timestamp(TimeUnit::Millisecond, Some(std::sync::Arc::new("America/New_york".to_string()))), ); let result = subtract_duration(×tamp, &duration); assert_eq!(result, expected); let timestamp = PrimitiveArray::from([Some(10i64), Some(20i64), None, Some(30i64)]).to( - DataType::Timestamp(TimeUnit::Nanosecond, Some("America/New_York".to_string())), + DataType::Timestamp(TimeUnit::Nanosecond, Some(std::sync::Arc::new("America/New_york".to_string()))), ); let duration = PrimitiveArray::from([Some(1i64), Some(2i64), None, Some(3i64)]) .to(DataType::Duration(TimeUnit::Second)); @@ -185,7 +185,7 @@ fn test_subtracting_subtract_timestamps_scale() { ]) .to(DataType::Timestamp( TimeUnit::Nanosecond, - Some("America/New_York".to_string()), + Some(std::sync::Arc::new("America/New_york".to_string())), )); let result = subtract_duration(×tamp, &duration); @@ -342,7 +342,7 @@ fn test_add_interval() { fn test_add_interval_offset() { let timestamp = PrimitiveArray::from_slice([1i64]).to(DataType::Timestamp( TimeUnit::Second, - Some("+01:00".to_string()), + Some(std::sync::Arc::new("+01:00".to_string())), )); let interval = months_days_ns::new(0, 1, 0); @@ -351,7 +351,7 @@ fn test_add_interval_offset() { let expected = PrimitiveArray::from_slice([1i64 + 24 * 60 * 60]).to(DataType::Timestamp( TimeUnit::Second, - Some("+01:00".to_string()), + Some(std::sync::Arc::new("+01:00".to_string())), )); let result = add_interval(×tamp, &intervals).unwrap(); @@ 
-366,7 +366,7 @@ fn test_add_interval_offset() { fn test_add_interval_tz() { let timestamp = PrimitiveArray::from_slice([1i64]).to(DataType::Timestamp( TimeUnit::Second, - Some("GMT".to_string()), + Some(std::sync::Arc::new("GMT".to_string())), )); let interval = months_days_ns::new(0, 1, 0); @@ -374,7 +374,7 @@ fn test_add_interval_tz() { let expected = PrimitiveArray::from_slice([1i64 + 24 * 60 * 60]).to(DataType::Timestamp( TimeUnit::Second, - Some("GMT".to_string()), + Some(std::sync::Arc::new("GMT".to_string())), )); let result = add_interval(×tamp, &intervals).unwrap(); diff --git a/tests/it/compute/cast.rs b/tests/it/compute/cast.rs index 5b3db0ffc83..22ec3fd040e 100644 --- a/tests/it/compute/cast.rs +++ b/tests/it/compute/cast.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::array::*; use arrow2::compute::cast::{can_cast_types, cast, CastOptions}; use arrow2::datatypes::*; @@ -125,7 +127,11 @@ fn i32_to_list_i32() { let array = Int32Array::from_slice([5, 6, 7, 8, 9]); let b = cast( &array, - &DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, true))), + &DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Int32, + true, + ))), CastOptions::default(), ) .unwrap(); @@ -149,7 +155,11 @@ fn i32_to_list_i32_nullable() { let array = Int32Array::from(input); let b = cast( &array, - &DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, true))), + &DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Int32, + true, + ))), CastOptions::default(), ) .unwrap(); @@ -173,7 +183,11 @@ fn i32_to_list_f64_nullable_sliced() { let array = array.sliced(2, 4); let b = cast( &array, - &DataType::List(std::sync::Arc::new(Field::new("item", DataType::Float64, true))), + &DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Float64, + true, + ))), CastOptions::default(), ) .unwrap(); @@ -483,7 +497,10 @@ fn consistency() { Float64, Timestamp(TimeUnit::Second, None), 
Timestamp(TimeUnit::Millisecond, None), - Timestamp(TimeUnit::Millisecond, Some("+01:00".to_string())), + Timestamp( + TimeUnit::Millisecond, + Some(std::sync::Arc::new("+01:00".to_string())), + ), Timestamp(TimeUnit::Microsecond, None), Timestamp(TimeUnit::Nanosecond, None), Time64(TimeUnit::Microsecond), @@ -622,7 +639,10 @@ fn int32_to_date32() { fn timestamp_to_date32() { test_primitive_to_primitive( &[864000000005i64, 1545696000001], - DataType::Timestamp(TimeUnit::Millisecond, Some(String::from("UTC"))), + DataType::Timestamp( + TimeUnit::Millisecond, + Some(std::sync::Arc::new("UTC".to_string())), + ), &[10000i32, 17890], DataType::Date32, ); @@ -632,7 +652,10 @@ fn timestamp_to_date32() { fn timestamp_to_date64() { test_primitive_to_primitive( &[864000000005i64, 1545696000001], - DataType::Timestamp(TimeUnit::Millisecond, Some(String::from("UTC"))), + DataType::Timestamp( + TimeUnit::Millisecond, + Some(std::sync::Arc::new("UTC".to_string())), + ), &[864000000005i64, 1545696000001i64], DataType::Date64, ); @@ -642,7 +665,10 @@ fn timestamp_to_date64() { fn timestamp_to_i64() { test_primitive_to_primitive( &[864000000005i64, 1545696000001], - DataType::Timestamp(TimeUnit::Millisecond, Some(String::from("UTC"))), + DataType::Timestamp( + TimeUnit::Millisecond, + Some(std::sync::Arc::new("UTC".to_string())), + ), &[864000000005i64, 1545696000001i64], DataType::Int64, ); @@ -759,7 +785,7 @@ fn list_to_from_fixed_size_list() { #[test] fn timestamp_with_tz_to_utf8() { - let tz = "-02:00".to_string(); + let tz = Arc::new("-02:00".to_string()); let expected = Utf8Array::::from_slice(["1996-12-19T16:39:57-02:00", "1996-12-19T17:39:57-02:00"]); let array = Int64Array::from_slice([851020797000000000, 851024397000000000]) @@ -771,7 +797,7 @@ fn timestamp_with_tz_to_utf8() { #[test] fn utf8_to_timestamp_with_tz() { - let tz = "-02:00".to_string(); + let tz = Arc::new("-02:00".to_string()); let array = Utf8Array::::from_slice(["1996-12-19T16:39:57-02:00", 
"1996-12-19T17:39:57-02:00"]); // the timezone is used to map the time to UTC. diff --git a/tests/it/compute/temporal.rs b/tests/it/compute/temporal.rs index 748a4dbfe60..25792b8dd74 100644 --- a/tests/it/compute/temporal.rs +++ b/tests/it/compute/temporal.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::array::*; use arrow2::compute::temporal::*; use arrow2::datatypes::*; @@ -198,7 +200,7 @@ fn test_data_tz() -> Vec { // Mon May 24 2021 17:25:30 GMT+0000 Int64Array::from(&[Some(1621877130000000), None]).to(DataType::Timestamp( TimeUnit::Microsecond, - Some("GMT".to_string()), + Some(std::sync::Arc::new("GMT".to_string())), )), ), year: Some(Int32Array::from(&[Some(2021), None])), @@ -213,7 +215,10 @@ fn test_data_tz() -> Vec { }, TestData { input: Box::new(Int64Array::from(&[Some(1621877130000000), None]).to( - DataType::Timestamp(TimeUnit::Microsecond, Some("+01:00".to_string())), + DataType::Timestamp( + TimeUnit::Microsecond, + Some(std::sync::Arc::new("+01:00".to_string())), + ), )), year: Some(Int32Array::from(&[Some(2021), None])), month: Some(UInt32Array::from(&[Some(5), None])), @@ -227,7 +232,10 @@ fn test_data_tz() -> Vec { }, TestData { input: Box::new(Int64Array::from(&[Some(1621877130000000), None]).to( - DataType::Timestamp(TimeUnit::Microsecond, Some("Europe/Lisbon".to_string())), + DataType::Timestamp( + TimeUnit::Microsecond, + Some(std::sync::Arc::new("Europe/Lisbon".to_string())), + ), )), year: Some(Int32Array::from(&[Some(2021), None])), month: Some(UInt32Array::from(&[Some(5), None])), @@ -244,7 +252,7 @@ fn test_data_tz() -> Vec { // Sun Mar 29 2020 00:00:00 GMT+0000 (Western European Standard Time) Int64Array::from(&[Some(1585440000), None]).to(DataType::Timestamp( TimeUnit::Second, - Some("Europe/Lisbon".to_string()), + Some(std::sync::Arc::new("Europe/Lisbon".to_string())), )), ), year: Some(Int32Array::from(&[Some(2020), None])), @@ -262,7 +270,7 @@ fn test_data_tz() -> Vec { // Sun Mar 29 2020 02:00:00 GMT+0100 (Western European 
Summer Time) Int64Array::from(&[Some(1585443600), None]).to(DataType::Timestamp( TimeUnit::Second, - Some("Europe/Lisbon".to_string()), + Some(std::sync::Arc::new("Europe/Lisbon".to_string())), )), ), year: Some(Int32Array::from(&[Some(2020), None])), @@ -346,7 +354,7 @@ fn consistency_check( Timestamp(TimeUnit::Millisecond, None), Timestamp(TimeUnit::Microsecond, None), Timestamp(TimeUnit::Nanosecond, None), - Timestamp(TimeUnit::Nanosecond, Some("+00:00".to_string())), + Timestamp(TimeUnit::Nanosecond, Some(Arc::new("+00:00".to_string()))), Time64(TimeUnit::Microsecond), Time64(TimeUnit::Nanosecond), Date32, diff --git a/tests/it/ffi/data.rs b/tests/it/ffi/data.rs index aedb23bc112..afb263e8531 100644 --- a/tests/it/ffi/data.rs +++ b/tests/it/ffi/data.rs @@ -3,6 +3,7 @@ use arrow2::bitmap::Bitmap; use arrow2::datatypes::{DataType, Field, TimeUnit}; use arrow2::{error::Result, ffi}; use std::collections::BTreeMap; +use std::sync::Arc; fn _test_round_trip(array: Box, expected: Box) -> Result<()> { let field = Field::new("a", array.data_type().clone(), true); @@ -93,7 +94,7 @@ fn decimal_nullable() -> Result<()> { fn timestamp_tz() -> Result<()> { let data = Int64Array::from(&vec![Some(2), None, None]).to(DataType::Timestamp( TimeUnit::Second, - Some("UTC".to_string()), + Some(Arc::new("UTC".to_string())), )); test_round_trip(data) } @@ -256,7 +257,10 @@ fn fixed_size_list_sliced() -> Result<()> { let bitmap = Bitmap::from([true, false, false, true]).sliced(1, 3); let array = FixedSizeListArray::try_new( - DataType::FixedSizeList(std::sync::Arc::new(Field::new("a", DataType::Int32, true)), 2), + DataType::FixedSizeList( + std::sync::Arc::new(Field::new("a", DataType::Int32, true)), + 2, + ), Box::new(PrimitiveArray::::from_vec(vec![1, 2, 3, 4, 5, 6])), Some(bitmap), )?; diff --git a/tests/it/io/csv/read.rs b/tests/it/io/csv/read.rs index ca9d56b1912..ee9f584d46c 100644 --- a/tests/it/io/csv/read.rs +++ b/tests/it/io/csv/read.rs @@ -426,7 +426,7 @@ fn 
deserialize_timestamp() -> Result<()> { let input = vec!["1996-12-19T16:34:57-02:00", "1996-12-19T16:34:58-02:00"]; let input = input.join("\n"); - let data_type = DataType::Timestamp(TimeUnit::Millisecond, Some("-01:00".to_string())); + let data_type = DataType::Timestamp(TimeUnit::Millisecond, Some(std::sync::Arc::new("-01:00".to_string()))); let expected = Int64Array::from([Some(851020497000), Some(851020498000)]).to(data_type.clone()); @@ -455,6 +455,6 @@ proptest! { #[test] #[cfg_attr(miri, ignore)] // miri and proptest do not work well :( fn dates(v in "1996-12-19T16:3[0-9]:57-02:00") { - assert_eq!(infer(v.as_bytes()), DataType::Timestamp(TimeUnit::Millisecond, Some("-02:00".to_string()))); + assert_eq!(infer(v.as_bytes()), DataType::Timestamp(TimeUnit::Millisecond, Some(std::sync::Arc::new("-02:00".to_string())))); } } diff --git a/tests/it/io/csv/write.rs b/tests/it/io/csv/write.rs index 2800c3bd77c..8527a5da895 100644 --- a/tests/it/io/csv/write.rs +++ b/tests/it/io/csv/write.rs @@ -226,7 +226,7 @@ fn data_array(column: &str) -> (Chunk>, Vec<&'static str>) { ]) .to(DataType::Timestamp( TimeUnit::Nanosecond, - Some("+01:00".to_string()), + Some(std::sync::Arc::new("+01:00".to_string())), )); ( array.boxed(), @@ -243,7 +243,7 @@ fn data_array(column: &str) -> (Chunk>, Vec<&'static str>) { ]) .to(DataType::Timestamp( TimeUnit::Nanosecond, - Some("Europe/Lisbon".to_string()), + Some(std::sync::Arc::new("Europe/Lisbon".to_string())), )); ( array.boxed(), @@ -344,7 +344,7 @@ fn write_tz_timezone_formatted_offset() -> Result<()> { PrimitiveArray::::from_slice([1_555_584_887_378_000_001, 1_555_555_555_555_000_001]) .to(DataType::Timestamp( TimeUnit::Nanosecond, - Some("+01:00".to_string()), + Some(std::sync::Arc::new("+01:00".to_string())), )); let columns = Chunk::new(vec![array.boxed()]); @@ -369,7 +369,7 @@ fn write_tz_timezone_formatted_tz() -> Result<()> { PrimitiveArray::::from_slice([1_555_584_887_378_000_001, 1_555_555_555_555_000_001]) 
.to(DataType::Timestamp( TimeUnit::Nanosecond, - Some("Europe/Lisbon".to_string()), + Some(std::sync::Arc::new("Europe/Lisbon".to_string())), )); let columns = Chunk::new(vec![array.boxed()]); diff --git a/tests/it/io/json/read.rs b/tests/it/io/json/read.rs index bdebaf7155c..37063165446 100644 --- a/tests/it/io/json/read.rs +++ b/tests/it/io/json/read.rs @@ -260,7 +260,7 @@ fn deserialize_timestamp_string_tz_s() -> Result<()> { let data_type = DataType::List(std::sync::Arc::new(Field::new( "item", - DataType::Timestamp(TimeUnit::Second, Some("+01:00".to_string())), + DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("+01:00".to_string()))), false, ))); @@ -268,7 +268,7 @@ fn deserialize_timestamp_string_tz_s() -> Result<()> { let expected = Int64Array::from([Some(1680870214)]).to(DataType::Timestamp( TimeUnit::Second, - Some("+01:00".to_string()), + Some(std::sync::Arc::new("+01:00".to_string())), )); assert_eq!(expected, result.as_ref()); diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index 3d8a99b3fcc..d3f651d0a90 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -586,7 +586,7 @@ pub fn pyarrow_nullable(column: &str) -> Box { PrimitiveArray::::from(i64_values).to(DataType::Timestamp(TimeUnit::Second, None)), ), "timestamp_s_utc" => Box::new(PrimitiveArray::::from(i64_values).to( - DataType::Timestamp(TimeUnit::Second, Some("UTC".to_string())), + DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("UTC".to_string()))), )), _ => unreachable!(), } @@ -739,11 +739,11 @@ pub fn pyarrow_nullable_statistics(column: &str) -> Statistics { null_count: UInt64Array::from([Some(3)]).boxed(), min_value: Box::new(Int64Array::from_slice([-256]).to(DataType::Timestamp( TimeUnit::Second, - Some("UTC".to_string()), + Some(std::sync::Arc::new("UTC".to_string())), ))), max_value: Box::new(Int64Array::from_slice([9]).to(DataType::Timestamp( TimeUnit::Second, - Some("UTC".to_string()), + 
Some(std::sync::Arc::new("UTC".to_string())), ))), }, _ => unreachable!(), @@ -1622,7 +1622,7 @@ fn generic_data() -> Result<(Schema, Chunk>)> { let values = PrimitiveArray::from_slice([1i64, 3]) .to(DataType::Timestamp( TimeUnit::Millisecond, - Some("UTC".to_string()), + Some(std::sync::Arc::new("UTC".to_string())), )) .boxed(); let array7 = DictionaryArray::try_from_keys(indices.clone(), values).unwrap(); diff --git a/tests/it/io/print.rs b/tests/it/io/print.rs index 8cbc15a95f8..eca079a0cfd 100644 --- a/tests/it/io/print.rs +++ b/tests/it/io/print.rs @@ -161,7 +161,7 @@ fn write_timestamp_second_with_tz() { ]; check_datetime!( i64, - DataType::Timestamp(TimeUnit::Second, Some("UTC".to_string())), + DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("UTC".to_string()))), 11111111, expected ); diff --git a/tests/it/temporal_conversions.rs b/tests/it/temporal_conversions.rs index 0d93c31a452..b4cf91487e3 100644 --- a/tests/it/temporal_conversions.rs +++ b/tests/it/temporal_conversions.rs @@ -2,6 +2,7 @@ use arrow2::array::*; use arrow2::datatypes::TimeUnit; use arrow2::temporal_conversions; use arrow2::types::months_days_ns; +use std::sync::Arc; #[test] fn naive() { @@ -125,7 +126,7 @@ fn naive_no_tz() { #[test] fn tz_aware() { - let tz = "-02:00".to_string(); + let tz = Arc::new("-02:00".to_string()); let expected = "Timestamp(Nanosecond, Some(\"-02:00\"))[1996-12-19 16:39:57 -02:00, 1996-12-19 17:39:57 -02:00, None]"; let fmt = "%Y-%m-%dT%H:%M:%S%.f%:z"; @@ -140,7 +141,7 @@ fn tz_aware() { #[test] fn tz_aware_no_timezone() { - let tz = "-02:00".to_string(); + let tz = Arc::new("-02:00".to_string()); let expected = "Timestamp(Nanosecond, Some(\"-02:00\"))[None, None, None]"; let fmt = "%Y-%m-%dT%H:%M:%S%.f"; let array = Utf8Array::::from_slice([ From d2742752b8766349409904fd2067de2d41128915 Mon Sep 17 00:00:00 2001 From: Clement Rey Date: Thu, 13 Apr 2023 18:55:06 +0200 Subject: [PATCH 3/4] Every other arm, limiting pain as much as possible --- 
src/array/dictionary/mod.rs | 4 +- src/array/dictionary/mutable.rs | 4 +- src/array/struct_/mod.rs | 8 ++- src/array/union/mod.rs | 4 +- src/compute/cast/dictionary_to.rs | 4 +- src/datatypes/mod.rs | 42 ++++++++---- src/ffi/schema.rs | 30 +++++---- src/io/avro/read/nested.rs | 2 +- src/io/avro/read/schema.rs | 6 +- src/io/ipc/read/schema.rs | 11 ++-- src/io/ipc/write/schema.rs | 8 ++- src/io/json/read/infer_schema.rs | 17 +++-- src/io/json_integration/read/schema.rs | 12 ++-- src/io/orc/read/mod.rs | 7 +- .../read/deserialize/binary/dictionary.rs | 6 +- .../fixed_size_binary/dictionary.rs | 6 +- .../read/deserialize/primitive/dictionary.rs | 6 +- src/io/parquet/read/deserialize/struct_.rs | 6 +- src/io/parquet/read/schema/convert.rs | 26 ++++---- src/io/parquet/read/statistics/mod.rs | 4 +- src/io/parquet/write/mod.rs | 4 +- src/io/parquet/write/pages.rs | 20 ++++-- tests/it/array/dictionary/mod.rs | 29 ++++++--- tests/it/array/fixed_size_binary/mod.rs | 4 +- tests/it/array/growable/list.rs | 4 +- tests/it/array/growable/map.rs | 2 +- tests/it/array/growable/mod.rs | 8 ++- tests/it/array/growable/struct_.rs | 2 +- tests/it/array/growable/union.rs | 6 +- tests/it/array/map/mod.rs | 11 +++- tests/it/array/mod.rs | 22 ++++--- tests/it/array/struct_/iterator.rs | 2 +- tests/it/array/struct_/mod.rs | 2 +- tests/it/array/struct_/mutable.rs | 4 +- tests/it/array/union.rs | 28 ++++---- tests/it/arrow.rs | 30 +++++---- tests/it/compute/cast.rs | 6 +- tests/it/compute/comparison.rs | 4 +- tests/it/compute/sort/row/mod.rs | 4 +- tests/it/compute/take.rs | 2 +- tests/it/ffi/data.rs | 12 ++-- tests/it/io/avro/read.rs | 30 ++++++--- tests/it/io/avro/write.rs | 38 ++++++++--- tests/it/io/ipc/mmap.rs | 6 +- tests/it/io/json/read.rs | 9 ++- tests/it/io/json/write.rs | 50 +++++++------- tests/it/io/ndjson/mod.rs | 62 +++++++++++------- tests/it/io/ndjson/read.rs | 29 ++++++--- tests/it/io/parquet/mod.rs | 65 +++++++++++++------ tests/it/io/print.rs | 4 +- tests/it/scalar/map.rs | 
20 ++++-- tests/it/scalar/struct_.rs | 6 +- 52 files changed, 460 insertions(+), 278 deletions(-) diff --git a/src/array/dictionary/mod.rs b/src/array/dictionary/mod.rs index f7d4a0f43d7..91d358e4484 100644 --- a/src/array/dictionary/mod.rs +++ b/src/array/dictionary/mod.rs @@ -1,4 +1,4 @@ -use std::hint::unreachable_unchecked; +use std::{hint::unreachable_unchecked, sync::Arc}; use crate::{ bitmap::{ @@ -290,7 +290,7 @@ impl DictionaryArray { } pub(crate) fn default_data_type(values_datatype: DataType) -> DataType { - DataType::Dictionary(K::KEY_TYPE, Box::new(values_datatype), false) + DataType::Dictionary(K::KEY_TYPE, Arc::new(values_datatype), false) } /// Slices this [`DictionaryArray`]. diff --git a/src/array/dictionary/mutable.rs b/src/array/dictionary/mutable.rs index 444de34bcc4..98cd689afd8 100644 --- a/src/array/dictionary/mutable.rs +++ b/src/array/dictionary/mutable.rs @@ -55,7 +55,7 @@ impl From for MutableDictionaryArray Self { data_type: DataType::Dictionary( K::KEY_TYPE, - Box::new(values.data_type().clone()), + std::sync::Arc::new(values.data_type().clone()), false, ), keys: MutablePrimitiveArray::::new(), @@ -72,7 +72,7 @@ impl MutableDictionaryArray { Self { data_type: DataType::Dictionary( K::KEY_TYPE, - Box::new(values.data_type().clone()), + std::sync::Arc::new(values.data_type().clone()), false, ), keys: MutablePrimitiveArray::::new(), diff --git a/src/array/struct_/mod.rs b/src/array/struct_/mod.rs index 767ba8242fc..ed4e89aa47c 100644 --- a/src/array/struct_/mod.rs +++ b/src/array/struct_/mod.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use crate::{ bitmap::Bitmap, datatypes::{DataType, Field}, @@ -28,7 +30,7 @@ pub use mutable::*; /// Field::new("c", DataType::Int32, false), /// ]; /// -/// let array = StructArray::new(DataType::Struct(fields), vec![boolean, int], None); +/// let array = StructArray::new(DataType::Struct(std::sync::Arc::new(fields)), vec![boolean, int], None); /// ``` #[derive(Clone)] pub struct StructArray { @@ -69,7 +71,7 
@@ impl StructArray { .try_for_each(|(index, (data_type, child))| { if data_type != child { Err(Error::oos(format!( - "The children DataTypes of a StructArray must equal the children data types. + "The children DataTypes of a StructArray must equal the children data types. However, the field {index} has data type {data_type:?} but the value has data type {child:?}" ))) } else { @@ -153,7 +155,7 @@ impl StructArray { impl StructArray { /// Deconstructs the [`StructArray`] into its individual components. #[must_use] - pub fn into_data(self) -> (Vec, Vec>, Option) { + pub fn into_data(self) -> (Arc>, Vec>, Option) { let Self { data_type, values, diff --git a/src/array/union/mod.rs b/src/array/union/mod.rs index e3e664916f8..624a6d93bc3 100644 --- a/src/array/union/mod.rs +++ b/src/array/union/mod.rs @@ -73,7 +73,7 @@ impl UnionArray { .try_for_each(|(index, (data_type, child))| { if data_type != child { Err(Error::oos(format!( - "The children DataTypes of a UnionArray must equal the children data types. + "The children DataTypes of a UnionArray must equal the children data types. 
However, the field {index} has data type {data_type:?} but the value has data type {child:?}" ))) } else { @@ -352,7 +352,7 @@ impl UnionArray { fn try_get_all(data_type: &DataType) -> Result { match data_type.to_logical_type() { DataType::Union(fields, ids, mode) => { - Ok((fields, ids.as_ref().map(|x| x.as_ref()), *mode)) + Ok((fields, ids.as_ref().map(|x| x.as_slice()), *mode)) } _ => Err(Error::oos( "The UnionArray requires a logical type of DataType::Union", diff --git a/src/compute/cast/dictionary_to.rs b/src/compute/cast/dictionary_to.rs index 101669f6442..39533bdb065 100644 --- a/src/compute/cast/dictionary_to.rs +++ b/src/compute/cast/dictionary_to.rs @@ -89,7 +89,7 @@ where } else { let data_type = DataType::Dictionary( K2::KEY_TYPE, - Box::new(values.data_type().clone()), + std::sync::Arc::new(values.data_type().clone()), is_ordered, ); // Safety: this is safe because given a type `T` that fits in a `usize`, casting it to type `P` either overflows or also fits in a `usize` @@ -116,7 +116,7 @@ where } else { let data_type = DataType::Dictionary( K2::KEY_TYPE, - Box::new(values.data_type().clone()), + std::sync::Arc::new(values.data_type().clone()), is_ordered, ); // some of the values may not fit in `usize` and thus this needs to be checked diff --git a/src/datatypes/mod.rs b/src/datatypes/mod.rs index dc4f187a57e..f652a60f4ab 100644 --- a/src/datatypes/mod.rs +++ b/src/datatypes/mod.rs @@ -147,10 +147,10 @@ pub enum DataType { /// A list of some logical data type whose offsets are represented as [`i64`]. LargeList(Arc), /// A nested [`DataType`] with a given number of [`Field`]s. - Struct(Vec), + Struct(Arc>), /// A nested datatype that can represent slots of differing types. /// Third argument represents mode - Union(Vec, Option>, UnionMode), + Union(Arc>, Option>>, UnionMode), /// A nested type that is represented as /// /// List> @@ -189,7 +189,7 @@ pub enum DataType { /// arrays or a limited set of primitive types as integers. 
/// /// The `bool` value indicates the `Dictionary` is sorted if set to `true`. - Dictionary(IntegerType, Box, bool), + Dictionary(IntegerType, Arc, bool), /// Decimal value with precision and scale /// precision is the number of digits in the number and /// scale is the number of decimal places. @@ -198,7 +198,7 @@ pub enum DataType { /// Decimal backed by 256 bits Decimal256(usize, usize), /// Extension type. - Extension(String, Box, Option), + Extension(String, Arc, Option>), } #[cfg(feature = "arrow")] @@ -239,15 +239,29 @@ impl From for arrow_schema::DataType { DataType::LargeList(f) => { Self::LargeList(Box::new(Arc::unwrap_or_clone_polyfill(f).into())) } - DataType::Struct(f) => Self::Struct(f.into_iter().map(Into::into).collect()), + DataType::Struct(f) => Self::Struct( + Arc::unwrap_or_clone_polyfill(f) + .into_iter() + .map(Into::into) + .collect(), + ), DataType::Union(fields, Some(ids), mode) => { - let ids = ids.into_iter().map(|x| x as _).collect(); - let fields = fields.into_iter().map(Into::into).collect(); + let ids = Arc::unwrap_or_clone_polyfill(ids) + .into_iter() + .map(|x| x as _) + .collect(); + let fields = Arc::unwrap_or_clone_polyfill(fields) + .into_iter() + .map(Into::into) + .collect(); Self::Union(fields, ids, mode.into()) } DataType::Union(fields, None, mode) => { let ids = (0..fields.len() as i8).collect(); - let fields = fields.into_iter().map(Into::into).collect(); + let fields = Arc::unwrap_or_clone_polyfill(fields) + .into_iter() + .map(Into::into) + .collect(); Self::Union(fields, ids, mode.into()) } DataType::Map(f, ordered) => { @@ -255,11 +269,11 @@ impl From for arrow_schema::DataType { } DataType::Dictionary(key, value, _) => Self::Dictionary( Box::new(DataType::from(key).into()), - Box::new((*value).into()), + Box::new(Arc::unwrap_or_clone_polyfill(value).into()), ), DataType::Decimal(precision, scale) => Self::Decimal128(precision as _, scale as _), DataType::Decimal256(precision, scale) => Self::Decimal256(precision as 
_, scale as _), - DataType::Extension(_, d, _) => (*d).into(), + DataType::Extension(_, d, _) => Arc::unwrap_or_clone_polyfill(d).into(), } } } @@ -299,10 +313,10 @@ impl From for DataType { Self::FixedSizeList(Arc::new((*f).into()), size as _) } DataType::LargeList(f) => Self::LargeList(Arc::new((*f).into())), - DataType::Struct(f) => Self::Struct(f.into_iter().map(Into::into).collect()), + DataType::Struct(f) => Self::Struct(Arc::new(f.into_iter().map(Into::into).collect())), DataType::Union(fields, ids, mode) => { - let ids = ids.into_iter().map(|x| x as _).collect(); - let fields = fields.into_iter().map(Into::into).collect(); + let ids = Arc::new(ids.into_iter().map(|x| x as _).collect()); + let fields = Arc::new(fields.into_iter().map(Into::into).collect()); Self::Union(fields, Some(ids), mode.into()) } DataType::Map(f, ordered) => Self::Map(std::sync::Arc::new((*f).into()), ordered), @@ -318,7 +332,7 @@ impl From for DataType { DataType::UInt64 => IntegerType::UInt64, d => panic!("illegal dictionary key type: {d}"), }; - Self::Dictionary(key, Box::new((*value).into()), false) + Self::Dictionary(key, Arc::new((*value).into()), false) } DataType::Decimal128(precision, scale) => Self::Decimal(precision as _, scale as _), DataType::Decimal256(precision, scale) => Self::Decimal256(precision as _, scale as _), diff --git a/src/ffi/schema.rs b/src/ffi/schema.rs index b36addc21ff..05bbddabb0c 100644 --- a/src/ffi/schema.rs +++ b/src/ffi/schema.rs @@ -87,7 +87,7 @@ impl ArrowSchema { if let Some(extension_metadata) = extension_metadata { metadata.insert( "ARROW:extension:metadata".to_string(), - extension_metadata.clone(), + extension_metadata.to_string(), ); } @@ -193,14 +193,18 @@ pub(crate) unsafe fn to_field(schema: &ArrowSchema) -> Result { let indices = to_integer_type(schema.format())?; let values = to_field(dictionary)?; let is_ordered = schema.flags & 1 == 1; - DataType::Dictionary(indices, Box::new(values.data_type().clone()), is_ordered) + 
DataType::Dictionary( + indices, + std::sync::Arc::new(values.data_type().clone()), + is_ordered, + ) } else { to_data_type(schema)? }; let (metadata, extension) = unsafe { metadata_from_bytes(schema.metadata) }; let data_type = if let Some((name, extension_metadata)) = extension { - DataType::Extension(name, Box::new(data_type), extension_metadata) + DataType::Extension(name, Arc::new(data_type), extension_metadata.map(Arc::new)) } else { data_type }; @@ -276,7 +280,7 @@ unsafe fn to_data_type(schema: &ArrowSchema) -> Result { let children = (0..schema.n_children as usize) .map(|x| to_field(schema.child(x))) .collect::>>()?; - DataType::Struct(children) + DataType::Struct(Arc::new(children)) } other => { match other.splitn(2, ':').collect::>()[..] { @@ -378,7 +382,7 @@ unsafe fn to_data_type(schema: &ArrowSchema) -> Result { let fields = (0..schema.n_children as usize) .map(|x| to_field(schema.child(x))) .collect::>>()?; - DataType::Union(fields, Some(type_ids), mode) + DataType::Union(Arc::new(fields), Some(Arc::new(type_ids)), mode) } _ => { return Err(Error::OutOfSpec(format!( @@ -576,40 +580,40 @@ mod tests { DataType::List(Arc::new(Field::new("example", DataType::Boolean, false))), DataType::FixedSizeList(Arc::new(Field::new("example", DataType::Boolean, false)), 2), DataType::LargeList(Arc::new(Field::new("example", DataType::Boolean, false))), - DataType::Struct(vec![ + DataType::Struct(Arc::new(vec![ Field::new("a", DataType::Int64, true), Field::new( "b", DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), true, ), - ]), + ])), DataType::Map( std::sync::Arc::new(Field::new("a", DataType::Int64, true)), true, ), DataType::Union( - vec![ + Arc::new(vec![ Field::new("a", DataType::Int64, true), Field::new( "b", DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), true, ), - ], - Some(vec![1, 2]), + ]), + Some(Arc::new(vec![1, 2])), UnionMode::Dense, ), DataType::Union( - vec![ + Arc::new(vec![ Field::new("a", DataType::Int64, 
true), Field::new( "b", DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), true, ), - ], - Some(vec![0, 1]), + ]), + Some(Arc::new(vec![0, 1])), UnionMode::Sparse, ), ]; diff --git a/src/io/avro/read/nested.rs b/src/io/avro/read/nested.rs index 056d9a8f836..cc752e976bb 100644 --- a/src/io/avro/read/nested.rs +++ b/src/io/avro/read/nested.rs @@ -129,7 +129,7 @@ impl FixedItemsUtf8Dictionary { Self { data_type: DataType::Dictionary( IntegerType::Int32, - Box::new(values.data_type().clone()), + std::sync::Arc::new(values.data_type().clone()), false, ), keys: MutablePrimitiveArray::::with_capacity(capacity), diff --git a/src/io/avro/read/schema.rs b/src/io/avro/read/schema.rs index 07b988fd71e..9c70b48e405 100644 --- a/src/io/avro/read/schema.rs +++ b/src/io/avro/read/schema.rs @@ -105,7 +105,7 @@ fn schema_to_field(schema: &AvroSchema, name: Option<&str>, props: Metadata) -> .iter() .map(|s| schema_to_field(s, None, Metadata::default())) .collect::>>()?; - DataType::Union(fields, None, UnionMode::Dense) + DataType::Union(Arc::new(fields), None, UnionMode::Dense) } } AvroSchema::Record(Record { fields, .. }) => { @@ -119,12 +119,12 @@ fn schema_to_field(schema: &AvroSchema, name: Option<&str>, props: Metadata) -> schema_to_field(&field.schema, Some(&field.name), props) }) .collect::>()?; - DataType::Struct(fields) + DataType::Struct(std::sync::Arc::new(fields)) } AvroSchema::Enum { .. 
} => { return Ok(Field::new( name.unwrap_or_default(), - DataType::Dictionary(IntegerType::Int32, Box::new(DataType::Utf8), false), + DataType::Dictionary(IntegerType::Int32, Arc::new(DataType::Utf8), false), false, )) } diff --git a/src/io/ipc/read/schema.rs b/src/io/ipc/read/schema.rs index b625f19d484..4d68f1281ad 100644 --- a/src/io/ipc/read/schema.rs +++ b/src/io/ipc/read/schema.rs @@ -133,7 +133,10 @@ fn deserialize_union(union_: UnionRef, field: FieldRef) -> Result<(DataType, Ipc fields: ipc_fields, dictionary_id: None, }; - Ok((DataType::Union(fields, ids, mode), ipc_field)) + Ok(( + DataType::Union(Arc::new(fields), ids.map(Arc::new), mode), + ipc_field, + )) } fn deserialize_map(map: MapRef, field: FieldRef) -> Result<(DataType, IpcField)> { @@ -172,7 +175,7 @@ fn deserialize_struct(field: FieldRef) -> Result<(DataType, IpcField)> { fields: ipc_fields, dictionary_id: None, }; - Ok((DataType::Struct(fields), ipc_field)) + Ok((DataType::Struct(std::sync::Arc::new(fields)), ipc_field)) } fn deserialize_list(field: FieldRef) -> Result<(DataType, IpcField)> { @@ -252,7 +255,7 @@ fn get_data_type( let (inner, mut ipc_field) = get_data_type(field, extension, false)?; ipc_field.dictionary_id = Some(dictionary.id()?); return Ok(( - DataType::Dictionary(index_type, Box::new(inner), dictionary.is_ordered()?), + DataType::Dictionary(index_type, Arc::new(inner), dictionary.is_ordered()?), ipc_field, )); } @@ -262,7 +265,7 @@ fn get_data_type( let (name, metadata) = extension; let (data_type, fields) = get_data_type(field, None, false)?; return Ok(( - DataType::Extension(name, Box::new(data_type), metadata), + DataType::Extension(name, Arc::new(data_type), metadata.map(Arc::new)), fields, )); } diff --git a/src/io/ipc/write/schema.rs b/src/io/ipc/write/schema.rs index 5c35c8104f3..e1fa2b97aba 100644 --- a/src/io/ipc/write/schema.rs +++ b/src/io/ipc/write/schema.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow_format::ipc::planus::Builder; use crate::datatypes::{ @@ 
-71,14 +73,14 @@ fn write_metadata(metadata: &Metadata, kv_vec: &mut Vec, + metadata: &Option>, kv_vec: &mut Vec, ) { // metadata if let Some(metadata) = metadata { let entry = arrow_format::ipc::KeyValue { key: Some("ARROW:extension:metadata".to_string()), - value: Some(metadata.clone()), + value: Some(metadata.to_string()), }; kv_vec.push(entry); } @@ -247,7 +249,7 @@ fn serialize_type(data_type: &DataType) -> arrow_format::ipc::Type { UnionMode::Dense => ipc::UnionMode::Dense, UnionMode::Sparse => ipc::UnionMode::Sparse, }, - type_ids: type_ids.clone(), + type_ids: type_ids.as_ref().map(|type_ids| type_ids.to_vec()), })), Map(_, keys_sorted) => ipc::Type::Map(Box::new(ipc::Map { keys_sorted: *keys_sorted, diff --git a/src/io/json/read/infer_schema.rs b/src/io/json/read/infer_schema.rs index 13f0c50360f..e036c802885 100644 --- a/src/io/json/read/infer_schema.rs +++ b/src/io/json/read/infer_schema.rs @@ -1,4 +1,5 @@ use std::borrow::Borrow; +use std::sync::Arc; use indexmap::map::IndexMap as HashMap; use indexmap::set::IndexSet as HashSet; @@ -82,7 +83,7 @@ fn infer_object(inner: &HashMap) -> Result { Ok(Field::new(key, dt, true)) }) .collect::>>()?; - Ok(DataType::Struct(fields)) + Ok(DataType::Struct(std::sync::Arc::new(fields))) } fn infer_array(values: &[Value]) -> Result { @@ -141,7 +142,7 @@ pub(crate) fn coerce_data_type>(datatypes: &[A]) -> DataType // all are structs => union of all fields (that may have equal names) let fields = datatypes.iter().fold(vec![], |mut acc, dt| { if let Struct(new_fields) = dt.borrow() { - acc.extend(new_fields); + acc.extend(new_fields.as_slice()); }; acc }); @@ -170,7 +171,7 @@ pub(crate) fn coerce_data_type>(datatypes: &[A]) -> DataType Field::new(name, coerce_data_type(&dts), true) }) .collect(); - return Struct(fields); + return Struct(Arc::new(fields)); } else if datatypes.len() > 2 { return Utf8; } @@ -214,11 +215,17 @@ mod test { List(std::sync::Arc::new(Field::new(ITEM_NAME, Float64, true))), ); assert_eq!( - 
coerce_data_type(&[Float64, List(std::sync::Arc::new(Field::new(ITEM_NAME, Int64, true)))]), + coerce_data_type(&[ + Float64, + List(std::sync::Arc::new(Field::new(ITEM_NAME, Int64, true))) + ]), List(std::sync::Arc::new(Field::new(ITEM_NAME, Float64, true))), ); assert_eq!( - coerce_data_type(&[Int64, List(std::sync::Arc::new(Field::new(ITEM_NAME, Int64, true)))]), + coerce_data_type(&[ + Int64, + List(std::sync::Arc::new(Field::new(ITEM_NAME, Int64, true))) + ]), List(std::sync::Arc::new(Field::new(ITEM_NAME, Int64, true))), ); // boolean and number are incompatible, return utf8 diff --git a/src/io/json_integration/read/schema.rs b/src/io/json_integration/read/schema.rs index 66b88f1f8b7..c2b08762570 100644 --- a/src/io/json_integration/read/schema.rs +++ b/src/io/json_integration/read/schema.rs @@ -259,7 +259,7 @@ fn to_data_type(item: &Value, mut children: Vec) -> Result { )); } } - "struct" => DataType::Struct(children), + "struct" => DataType::Struct(Arc::new(children)), "union" => { let mode = if let Some(Value::String(mode)) = item.get("mode") { UnionMode::sparse(mode == "SPARSE") @@ -267,11 +267,13 @@ fn to_data_type(item: &Value, mut children: Vec) -> Result { return Err(Error::OutOfSpec("union requires mode".to_string())); }; let ids = if let Some(Value::Array(ids)) = item.get("typeIds") { - Some(ids.iter().map(|x| x.as_i64().unwrap() as i32).collect()) + Some(Arc::new( + ids.iter().map(|x| x.as_i64().unwrap() as i32).collect(), + )) } else { return Err(Error::OutOfSpec("union requires ids".to_string())); }; - DataType::Union(children, ids, mode) + DataType::Union(Arc::new(children), ids, mode) } "map" => { let sorted_keys = if let Some(Value::Bool(sorted_keys)) = item.get("keysSorted") { @@ -370,7 +372,7 @@ fn deserialize_field(value: &Value) -> Result { let data_type = to_data_type(type_, children)?; let data_type = if let Some((name, metadata)) = extension { - DataType::Extension(name, Box::new(data_type), metadata) + DataType::Extension(name, 
Arc::new(data_type), metadata.map(Arc::new)) } else { data_type }; @@ -392,7 +394,7 @@ fn deserialize_field(value: &Value) -> Result { )); } }; - DataType::Dictionary(index_type, Box::new(data_type), is_ordered) + DataType::Dictionary(index_type, Arc::new(data_type), is_ordered) } else { data_type }; diff --git a/src/io/orc/read/mod.rs b/src/io/orc/read/mod.rs index 3fe4abb7f63..32c4465c0bf 100644 --- a/src/io/orc/read/mod.rs +++ b/src/io/orc/read/mod.rs @@ -1,9 +1,10 @@ //! APIs to read from [ORC format](https://orc.apache.org). use std::io::Read; +use std::sync::Arc; use crate::array::{Array, BinaryArray, BooleanArray, Int64Array, PrimitiveArray, Utf8Array}; use crate::bitmap::{Bitmap, MutableBitmap}; -use crate::datatypes::{DataType, Field, Schema}; +use crate::datatypes::{ArcExt, DataType, Field, Schema}; use crate::error::Error; use crate::offset::{Offset, Offsets}; use crate::types::NativeType; @@ -21,7 +22,7 @@ pub fn infer_schema(footer: &Footer) -> Result { let dt = infer_dt(&footer.types[0], types)?; if let DataType::Struct(fields) = dt { - Ok(fields.into()) + Ok(Arc::unwrap_or_clone_polyfill(fields).into()) } else { Err(Error::ExternalFormat( "ORC root type must be a struct".to_string(), @@ -57,7 +58,7 @@ fn infer_dt(type_: &Type, types: &[Type]) -> Result { .map(|dt| Field::new(name, dt, true)) }) .collect::, Error>>()?; - DataType::Struct(sub_types) + DataType::Struct(Arc::new(sub_types)) } kind => return Err(Error::nyi(format!("Reading {kind:?} from ORC"))), }; diff --git a/src/io/parquet/read/deserialize/binary/dictionary.rs b/src/io/parquet/read/deserialize/binary/dictionary.rs index 6f883528ef8..df8d3988c8c 100644 --- a/src/io/parquet/read/deserialize/binary/dictionary.rs +++ b/src/io/parquet/read/deserialize/binary/dictionary.rs @@ -1,11 +1,11 @@ -use std::collections::VecDeque; +use std::{collections::VecDeque, sync::Arc}; use parquet2::page::DictPage; use crate::{ array::{Array, BinaryArray, DictionaryArray, DictionaryKey, Utf8Array}, 
bitmap::MutableBitmap, - datatypes::{DataType, PhysicalType}, + datatypes::{ArcExt, DataType, PhysicalType}, error::Result, io::parquet::read::deserialize::nested_utils::{InitNested, NestedState}, offset::Offset, @@ -53,7 +53,7 @@ where fn read_dict(data_type: DataType, dict: &DictPage) -> Box { let data_type = match data_type { - DataType::Dictionary(_, values, _) => *values, + DataType::Dictionary(_, values, _) => Arc::unwrap_or_clone_polyfill(values), _ => data_type, }; diff --git a/src/io/parquet/read/deserialize/fixed_size_binary/dictionary.rs b/src/io/parquet/read/deserialize/fixed_size_binary/dictionary.rs index 680834ad270..27ef41312e2 100644 --- a/src/io/parquet/read/deserialize/fixed_size_binary/dictionary.rs +++ b/src/io/parquet/read/deserialize/fixed_size_binary/dictionary.rs @@ -1,11 +1,11 @@ -use std::collections::VecDeque; +use std::{collections::VecDeque, sync::Arc}; use parquet2::page::DictPage; use crate::{ array::{Array, DictionaryArray, DictionaryKey, FixedSizeBinaryArray}, bitmap::MutableBitmap, - datatypes::DataType, + datatypes::{ArcExt, DataType}, error::Result, io::parquet::read::deserialize::nested_utils::{InitNested, NestedState}, }; @@ -48,7 +48,7 @@ where fn read_dict(data_type: DataType, dict: &DictPage) -> Box { let data_type = match data_type { - DataType::Dictionary(_, values, _) => *values, + DataType::Dictionary(_, values, _) => Arc::unwrap_or_clone_polyfill(values), _ => data_type, }; diff --git a/src/io/parquet/read/deserialize/primitive/dictionary.rs b/src/io/parquet/read/deserialize/primitive/dictionary.rs index 16fec526112..b46cc6f7286 100644 --- a/src/io/parquet/read/deserialize/primitive/dictionary.rs +++ b/src/io/parquet/read/deserialize/primitive/dictionary.rs @@ -1,11 +1,11 @@ -use std::collections::VecDeque; +use std::{collections::VecDeque, sync::Arc}; use parquet2::{page::DictPage, types::NativeType as ParquetNativeType}; use crate::{ array::{Array, DictionaryArray, DictionaryKey, PrimitiveArray}, 
bitmap::MutableBitmap, - datatypes::DataType, + datatypes::{ArcExt, DataType}, error::Result, types::NativeType, }; @@ -24,7 +24,7 @@ where F: Copy + Fn(P) -> T, { let data_type = match data_type { - DataType::Dictionary(_, values, _) => *values, + DataType::Dictionary(_, values, _) => Arc::unwrap_or_clone_polyfill(values), _ => data_type, }; let values = deserialize_plain(&dict.buffer, op); diff --git a/src/io/parquet/read/deserialize/struct_.rs b/src/io/parquet/read/deserialize/struct_.rs index dd5776948cd..72209260df7 100644 --- a/src/io/parquet/read/deserialize/struct_.rs +++ b/src/io/parquet/read/deserialize/struct_.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use crate::array::{Array, StructArray}; use crate::datatypes::{DataType, Field}; use crate::error::Error; @@ -7,12 +9,12 @@ use super::nested_utils::{NestedArrayIter, NestedState}; /// An iterator adapter over [`NestedArrayIter`] assumed to be encoded as Struct arrays pub struct StructIterator<'a> { iters: Vec>, - fields: Vec, + fields: Arc>, } impl<'a> StructIterator<'a> { /// Creates a new [`StructIterator`] with `iters` and `fields`. 
- pub fn new(iters: Vec>, fields: Vec) -> Self { + pub fn new(iters: Vec>, fields: Arc>) -> Self { assert_eq!(iters.len(), fields.len()); Self { iters, fields } } diff --git a/src/io/parquet/read/schema/convert.rs b/src/io/parquet/read/schema/convert.rs index 1d6442ca52a..04dcf2d6547 100644 --- a/src/io/parquet/read/schema/convert.rs +++ b/src/io/parquet/read/schema/convert.rs @@ -238,7 +238,7 @@ fn to_struct(fields: &[ParquetType]) -> Option { if fields.is_empty() { None } else { - Some(DataType::Struct(fields)) + Some(DataType::Struct(std::sync::Arc::new(fields))) } } @@ -623,10 +623,10 @@ mod tests { // }; // } { - let arrow_struct = DataType::Struct(vec![ + let arrow_struct = DataType::Struct(Arc::new(vec![ Field::new("str", DataType::Utf8, false), Field::new("num", DataType::Int32, false), - ]); + ])); arrow_fields.push(Field::new( "my_list", DataType::List(std::sync::Arc::new(Field::new( @@ -646,7 +646,8 @@ mod tests { // } // Special case: group is named array { - let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); + let arrow_struct = + DataType::Struct(Arc::new(vec![Field::new("str", DataType::Utf8, false)])); arrow_fields.push(Field::new( "my_list", DataType::List(std::sync::Arc::new(Field::new("array", arrow_struct, true))), @@ -662,7 +663,8 @@ mod tests { // } // Special case: group named ends in _tuple { - let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]); + let arrow_struct = + DataType::Struct(Arc::new(vec![Field::new("str", DataType::Utf8, false)])); arrow_fields.push(Field::new( "my_list", DataType::List(std::sync::Arc::new(Field::new( @@ -784,10 +786,10 @@ mod tests { fn test_nested_schema() -> Result<()> { let mut arrow_fields = Vec::new(); { - let group1_fields = vec![ + let group1_fields = Arc::new(vec![ Field::new("leaf1", DataType::Boolean, false), Field::new("leaf2", DataType::Int32, false), - ]; + ]); let group1_struct = Field::new("group1", DataType::Struct(group1_fields), 
false); arrow_fields.push(group1_struct); @@ -822,7 +824,7 @@ mod tests { "innerGroup", DataType::List(std::sync::Arc::new(Field::new( "innerGroup", - DataType::Struct(vec![Field::new("leaf3", DataType::Int32, true)]), + DataType::Struct(Arc::new(vec![Field::new("leaf3", DataType::Int32, true)])), true, ))), true, @@ -832,10 +834,10 @@ mod tests { "outerGroup", DataType::List(std::sync::Arc::new(Field::new( "outerGroup", - DataType::Struct(vec![ + DataType::Struct(Arc::new(vec![ Field::new("leaf2", DataType::Int32, true), inner_group_list, - ]), + ])), true, ))), true, @@ -1016,7 +1018,7 @@ mod tests { ), Field::new( "struct", - DataType::Struct(vec![ + DataType::Struct(Arc::new(vec![ Field::new("bools", DataType::Boolean, false), Field::new("uint32", DataType::UInt32, false), Field::new( @@ -1028,7 +1030,7 @@ mod tests { ))), false, ), - ]), + ])), false, ), Field::new("dictionary_strings", DataType::Utf8, false), diff --git a/src/io/parquet/read/statistics/mod.rs b/src/io/parquet/read/statistics/mod.rs index b2f1766c015..7c609228f5f 100644 --- a/src/io/parquet/read/statistics/mod.rs +++ b/src/io/parquet/read/statistics/mod.rs @@ -204,12 +204,12 @@ fn make_mutable(data_type: &DataType, capacity: usize) -> Result DataType { if let DataType::Struct(fields) = data_type.to_logical_type() { - DataType::Struct( + DataType::Struct(Arc::new( fields .iter() .map(|f| Field::new(&f.name, create_dt(&f.data_type), f.is_nullable)) .collect(), - ) + )) } else if let DataType::Map(f, ordered) = data_type.to_logical_type() { DataType::Map( Arc::new(Field::new(&f.name, create_dt(&f.data_type), f.is_nullable)), diff --git a/src/io/parquet/write/mod.rs b/src/io/parquet/write/mod.rs index 4b86d707f68..2e60d0b2790 100644 --- a/src/io/parquet/write/mod.rs +++ b/src/io/parquet/write/mod.rs @@ -707,7 +707,7 @@ fn transverse_recursive T + Clone>( } Struct => { if let DataType::Struct(fields) = data_type.to_logical_type() { - for field in fields { + for field in fields.as_slice() { 
transverse_recursive(&field.data_type, map.clone(), encodings) } } else { @@ -717,7 +717,7 @@ fn transverse_recursive T + Clone>( Map => { if let DataType::Map(field, _) = data_type.to_logical_type() { if let DataType::Struct(fields) = field.data_type.to_logical_type() { - for field in fields { + for field in fields.as_slice() { transverse_recursive(&field.data_type, map.clone(), encodings) } } else { diff --git a/src/io/parquet/write/pages.rs b/src/io/parquet/write/pages.rs index eae4b70250b..0644f6b48d7 100644 --- a/src/io/parquet/write/pages.rs +++ b/src/io/parquet/write/pages.rs @@ -258,6 +258,8 @@ pub fn array_to_columns + Send + Sync>( #[cfg(test)] mod tests { + use std::sync::Arc; + use parquet2::schema::types::{GroupLogicalType, PrimitiveConvertedType, PrimitiveLogicalType}; use parquet2::schema::Repetition; @@ -280,7 +282,7 @@ mod tests { ]; let array = StructArray::new( - DataType::Struct(fields), + DataType::Struct(std::sync::Arc::new(fields)), vec![boolean.clone(), int.clone()], Some(Bitmap::from([true, true, false, true])), ); @@ -344,7 +346,7 @@ mod tests { ]; let array = StructArray::new( - DataType::Struct(fields), + DataType::Struct(std::sync::Arc::new(fields)), vec![boolean.clone(), int.clone()], Some(Bitmap::from([true, true, false, true])), ); @@ -355,7 +357,7 @@ mod tests { ]; let array = StructArray::new( - DataType::Struct(fields), + DataType::Struct(std::sync::Arc::new(fields)), vec![Box::new(array.clone()), Box::new(array)], None, ); @@ -447,13 +449,17 @@ mod tests { ]; let array = StructArray::new( - DataType::Struct(fields), + DataType::Struct(std::sync::Arc::new(fields)), vec![boolean.clone(), int.clone()], Some(Bitmap::from([true, true, false, true])), ); let array = ListArray::new( - DataType::List(std::sync::Arc::new(Field::new("l", array.data_type().clone(), true))), + DataType::List(std::sync::Arc::new(Field::new( + "l", + array.data_type().clone(), + true, + ))), vec![0i32, 2, 4].try_into().unwrap(), Box::new(array), None, @@ 
-540,10 +546,10 @@ mod tests { #[test] fn test_map() { - let kv_type = DataType::Struct(vec![ + let kv_type = DataType::Struct(Arc::new(vec![ Field::new("k", DataType::Utf8, false), Field::new("v", DataType::Int32, false), - ]); + ])); let kv_field = Field::new("kv", kv_type.clone(), false); let map_type = DataType::Map(std::sync::Arc::new(kv_field), false); diff --git a/tests/it/array/dictionary/mod.rs b/tests/it/array/dictionary/mod.rs index 0ee0c374764..0a2e882465a 100644 --- a/tests/it/array/dictionary/mod.rs +++ b/tests/it/array/dictionary/mod.rs @@ -1,12 +1,17 @@ mod mutable; +use std::sync::Arc; + use arrow2::{array::*, datatypes::DataType}; #[test] fn try_new_ok() { let values = Utf8Array::::from_slice(["a", "aa"]); - let data_type = - DataType::Dictionary(i32::KEY_TYPE, Box::new(values.data_type().clone()), false); + let data_type = DataType::Dictionary( + i32::KEY_TYPE, + std::sync::Arc::new(values.data_type().clone()), + false, + ); let array = DictionaryArray::try_new( data_type, PrimitiveArray::from_vec(vec![1, 0]), @@ -27,8 +32,11 @@ fn try_new_ok() { #[test] fn try_new_incorrect_key() { let values = Utf8Array::::from_slice(["a", "aa"]); - let data_type = - DataType::Dictionary(i16::KEY_TYPE, Box::new(values.data_type().clone()), false); + let data_type = DataType::Dictionary( + i16::KEY_TYPE, + std::sync::Arc::new(values.data_type().clone()), + false, + ); let r = DictionaryArray::try_new( data_type, @@ -47,8 +55,11 @@ fn try_new_nulls() { let value: &[&str] = &[]; let values = Utf8Array::::from_slice(value); - let data_type = - DataType::Dictionary(u32::KEY_TYPE, Box::new(values.data_type().clone()), false); + let data_type = DataType::Dictionary( + u32::KEY_TYPE, + std::sync::Arc::new(values.data_type().clone()), + false, + ); let r = DictionaryArray::try_new(data_type, keys, values.boxed()).is_ok(); assert!(r); @@ -72,7 +83,7 @@ fn try_new_incorrect_dt() { #[test] fn try_new_incorrect_values_dt() { let values = Utf8Array::::from_slice(["a", 
"aa"]); - let data_type = DataType::Dictionary(i32::KEY_TYPE, Box::new(DataType::LargeUtf8), false); + let data_type = DataType::Dictionary(i32::KEY_TYPE, Arc::new(DataType::LargeUtf8), false); let r = DictionaryArray::try_new( data_type, @@ -106,7 +117,7 @@ fn try_new_out_of_bounds_neg() { #[test] fn new_null() { - let dt = DataType::Dictionary(i16::KEY_TYPE, Box::new(DataType::Int32), false); + let dt = DataType::Dictionary(i16::KEY_TYPE, Arc::new(DataType::Int32), false); let array = DictionaryArray::::new_null(dt, 2); assert_eq!(format!("{array:?}"), "DictionaryArray[None, None]"); @@ -114,7 +125,7 @@ fn new_null() { #[test] fn new_empty() { - let dt = DataType::Dictionary(i16::KEY_TYPE, Box::new(DataType::Int32), false); + let dt = DataType::Dictionary(i16::KEY_TYPE, Arc::new(DataType::Int32), false); let array = DictionaryArray::::new_empty(dt); assert_eq!(format!("{array:?}"), "DictionaryArray[]"); diff --git a/tests/it/array/fixed_size_binary/mod.rs b/tests/it/array/fixed_size_binary/mod.rs index c5524248ff5..cf322086a2d 100644 --- a/tests/it/array/fixed_size_binary/mod.rs +++ b/tests/it/array/fixed_size_binary/mod.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::{array::FixedSizeBinaryArray, bitmap::Bitmap, buffer::Buffer, datatypes::DataType}; mod mutable; @@ -89,7 +91,7 @@ fn to() { let extension = DataType::Extension( "a".to_string(), - Box::new(DataType::FixedSizeBinary(2)), + Arc::new(DataType::FixedSizeBinary(2)), None, ); let _ = a.to(extension); diff --git a/tests/it/array/growable/list.rs b/tests/it/array/growable/list.rs index 45006b6e3d6..5709aaf929a 100644 --- a/tests/it/array/growable/list.rs +++ b/tests/it/array/growable/list.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::{ array::{ growable::{Growable, GrowableList}, @@ -23,7 +25,7 @@ fn extension() { let array = create_list_array(data); let data_type = - DataType::Extension("ext".to_owned(), Box::new(array.data_type().clone()), None); + DataType::Extension("ext".to_owned(), 
Arc::new(array.data_type().clone()), None); let array_ext = ListArray::new( data_type, array.offsets().clone(), diff --git a/tests/it/array/growable/map.rs b/tests/it/array/growable/map.rs index e98b98903b3..4025e6b52b7 100644 --- a/tests/it/array/growable/map.rs +++ b/tests/it/array/growable/map.rs @@ -29,7 +29,7 @@ fn some_values() -> (DataType, Vec>) { Field::new("key", DataType::Utf8, true), Field::new("val", DataType::Int32, true), ]; - (DataType::Struct(fields), vec![strings, ints]) + (DataType::Struct(std::sync::Arc::new(fields)), vec![strings, ints]) } #[test] diff --git a/tests/it/array/growable/mod.rs b/tests/it/array/growable/mod.rs index d4b034a13e6..d614f1b411e 100644 --- a/tests/it/array/growable/mod.rs +++ b/tests/it/array/growable/mod.rs @@ -11,6 +11,8 @@ mod struct_; mod union; mod utf8; +use std::sync::Arc; + use arrow2::array::growable::make_growable; use arrow2::array::*; use arrow2::datatypes::{DataType, Field}; @@ -49,18 +51,18 @@ fn test_make_growable_extension() { .unwrap(); make_growable(&[&array], false, 2); - let data_type = DataType::Extension("ext".to_owned(), Box::new(DataType::Int32), None); + let data_type = DataType::Extension("ext".to_owned(), Arc::new(DataType::Int32), None); let array = Int32Array::from_slice([1, 2]).to(data_type.clone()); let array_grown = make_growable(&[&array], false, 2).as_box(); assert_eq!(array_grown.data_type(), &data_type); let data_type = DataType::Extension( "ext".to_owned(), - Box::new(DataType::Struct(vec![Field::new( + Arc::new(DataType::Struct(Arc::new(vec![Field::new( "a", DataType::Int32, false, - )])), + )]))), None, ); let array = StructArray::new( diff --git a/tests/it/array/growable/struct_.rs b/tests/it/array/growable/struct_.rs index 9596f23961d..9ab9ba7303f 100644 --- a/tests/it/array/growable/struct_.rs +++ b/tests/it/array/growable/struct_.rs @@ -24,7 +24,7 @@ fn some_values() -> (DataType, Vec>) { Field::new("f1", DataType::Utf8, true), Field::new("f2", DataType::Int32, true), ]; - 
(DataType::Struct(fields), vec![strings, ints]) + (DataType::Struct(std::sync::Arc::new(fields)), vec![strings, ints]) } #[test] diff --git a/tests/it/array/growable/union.rs b/tests/it/array/growable/union.rs index 520a64092e4..756d4458f1f 100644 --- a/tests/it/array/growable/union.rs +++ b/tests/it/array/growable/union.rs @@ -13,7 +13,7 @@ fn sparse() -> Result<()> { Field::new("a", DataType::Int32, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Sparse); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Sparse); let types = vec![0, 0, 1].into(); let fields = vec![ Int32Array::from(&[Some(1), None, Some(2)]).boxed(), @@ -45,7 +45,7 @@ fn dense() -> Result<()> { Field::new("a", DataType::Int32, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Dense); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Dense); let types = vec![0, 0, 1].into(); let fields = vec![ Int32Array::from(&[Some(1), None, Some(2)]).boxed(), @@ -83,7 +83,7 @@ fn complex_dense() -> Result<()> { Field::new("c", fixed_size_type.clone(), true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Dense); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Dense); // UnionArray[1, [11, 12, 13], abcd, [21, 22, 23], 2] let types = vec![0, 2, 1, 2, 0].into(); diff --git a/tests/it/array/map/mod.rs b/tests/it/array/map/mod.rs index 1d3ab488554..1a6bbe2ffa3 100644 --- a/tests/it/array/map/mod.rs +++ b/tests/it/array/map/mod.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::{ array::*, datatypes::{DataType, Field}, @@ -5,11 +7,14 @@ use arrow2::{ #[test] fn basics() { - let dt = DataType::Struct(vec![ + let dt = DataType::Struct(Arc::new(vec![ Field::new("a", DataType::Utf8, true), Field::new("b", DataType::Utf8, true), - ]); - let data_type = 
DataType::Map(std::sync::Arc::new(Field::new("a", dt.clone(), true)), false); + ])); + let data_type = DataType::Map( + std::sync::Arc::new(Field::new("a", dt.clone(), true)), + false, + ); let field = StructArray::new( dt.clone(), diff --git a/tests/it/array/mod.rs b/tests/it/array/mod.rs index 628daa47451..188f6081c50 100644 --- a/tests/it/array/mod.rs +++ b/tests/it/array/mod.rs @@ -13,6 +13,8 @@ mod struct_; mod union; mod utf8; +use std::sync::Arc; + use arrow2::array::{clone, new_empty_array, new_null_array, Array, PrimitiveArray}; use arrow2::bitmap::Bitmap; use arrow2::datatypes::{DataType, Field, UnionMode}; @@ -34,12 +36,12 @@ fn nulls() { // unions' null count is always 0 let datatypes = vec![ DataType::Union( - vec![Field::new("a", DataType::Binary, true)], + Arc::new(vec![Field::new("a", DataType::Binary, true)]), None, UnionMode::Dense, ), DataType::Union( - vec![Field::new("a", DataType::Binary, true)], + Arc::new(vec![Field::new("a", DataType::Binary, true)]), None, UnionMode::Sparse, ), @@ -60,20 +62,20 @@ fn empty() { DataType::List(std::sync::Arc::new(Field::new("a", DataType::Binary, true))), DataType::List(std::sync::Arc::new(Field::new( "a", - DataType::Extension("ext".to_owned(), Box::new(DataType::Int32), None), + DataType::Extension("ext".to_owned(), Arc::new(DataType::Int32), None), true, ))), DataType::Union( - vec![Field::new("a", DataType::Binary, true)], + Arc::new(vec![Field::new("a", DataType::Binary, true)]), None, UnionMode::Sparse, ), DataType::Union( - vec![Field::new("a", DataType::Binary, true)], + Arc::new(vec![Field::new("a", DataType::Binary, true)]), None, UnionMode::Dense, ), - DataType::Struct(vec![Field::new("a", DataType::Int32, true)]), + DataType::Struct(Arc::new(vec![Field::new("a", DataType::Int32, true)])), ]; let a = datatypes.into_iter().all(|x| new_empty_array(x).len() == 0); assert!(a); @@ -88,20 +90,20 @@ fn empty_extension() { DataType::Binary, DataType::List(std::sync::Arc::new(Field::new("a", 
DataType::Binary, true))), DataType::Union( - vec![Field::new("a", DataType::Binary, true)], + Arc::new(vec![Field::new("a", DataType::Binary, true)]), None, UnionMode::Sparse, ), DataType::Union( - vec![Field::new("a", DataType::Binary, true)], + Arc::new(vec![Field::new("a", DataType::Binary, true)]), None, UnionMode::Dense, ), - DataType::Struct(vec![Field::new("a", DataType::Int32, true)]), + DataType::Struct(Arc::new(vec![Field::new("a", DataType::Int32, true)])), ]; let a = datatypes .into_iter() - .map(|dt| DataType::Extension("ext".to_owned(), Box::new(dt), None)) + .map(|dt| DataType::Extension("ext".to_owned(), Arc::new(dt), None)) .all(|x| { let a = new_empty_array(x); a.len() == 0 && matches!(a.data_type(), DataType::Extension(_, _, _)) diff --git a/tests/it/array/struct_/iterator.rs b/tests/it/array/struct_/iterator.rs index be4a5eefbb4..a7190986e2e 100644 --- a/tests/it/array/struct_/iterator.rs +++ b/tests/it/array/struct_/iterator.rs @@ -13,7 +13,7 @@ fn test_simple_iter() { ]; let array = StructArray::new( - DataType::Struct(fields), + DataType::Struct(std::sync::Arc::new(fields)), vec![boolean.clone(), int.clone()], None, ); diff --git a/tests/it/array/struct_/mod.rs b/tests/it/array/struct_/mod.rs index cd32eee3f75..5e467b03796 100644 --- a/tests/it/array/struct_/mod.rs +++ b/tests/it/array/struct_/mod.rs @@ -16,7 +16,7 @@ fn debug() { ]; let array = StructArray::new( - DataType::Struct(fields), + DataType::Struct(std::sync::Arc::new(fields)), vec![boolean.clone(), int.clone()], Some(Bitmap::from([true, true, false, true])), ); diff --git a/tests/it/array/struct_/mutable.rs b/tests/it/array/struct_/mutable.rs index 19f2f12f15e..fdc6f5f204d 100644 --- a/tests/it/array/struct_/mutable.rs +++ b/tests/it/array/struct_/mutable.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::{ array::*, datatypes::{DataType, Field}, @@ -7,7 +9,7 @@ use arrow2::{ fn push() { let c1 = Box::new(MutablePrimitiveArray::::new()) as Box; let values = vec![c1]; - let 
data_type = DataType::Struct(vec![Field::new("f1", DataType::Int32, true)]); + let data_type = DataType::Struct(Arc::new(vec![Field::new("f1", DataType::Int32, true)])); let mut a = MutableStructArray::new(data_type, values); a.value::>(0) diff --git a/tests/it/array/union.rs b/tests/it/array/union.rs index 4a8c3aee214..32e3dca0b77 100644 --- a/tests/it/array/union.rs +++ b/tests/it/array/union.rs @@ -25,7 +25,7 @@ fn sparse_debug() -> Result<()> { Field::new("a", DataType::Int32, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Sparse); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Sparse); let types = vec![0, 0, 1].into(); let fields = vec![ Int32Array::from(&[Some(1), None, Some(2)]).boxed(), @@ -45,7 +45,7 @@ fn dense_debug() -> Result<()> { Field::new("a", DataType::Int32, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Dense); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Dense); let types = vec![0, 0, 1].into(); let fields = vec![ Int32Array::from(&[Some(1), None, Some(2)]).boxed(), @@ -66,7 +66,7 @@ fn slice() -> Result<()> { Field::new("a", DataType::Int32, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Sparse); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Sparse); let types = Buffer::from(vec![0, 0, 1]); let fields = vec![ Int32Array::from(&[Some(1), None, Some(2)]).boxed(), @@ -94,7 +94,7 @@ fn iter_sparse() -> Result<()> { Field::new("a", DataType::Int32, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Sparse); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Sparse); let types = Buffer::from(vec![0, 0, 1]); let fields = vec![ Int32Array::from(&[Some(1), None, Some(2)]).boxed(), @@ 
-127,7 +127,7 @@ fn iter_dense() -> Result<()> { Field::new("a", DataType::Int32, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Dense); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Dense); let types = Buffer::from(vec![0, 0, 1]); let offsets = Buffer::::from(vec![0, 1, 0]); let fields = vec![ @@ -161,7 +161,7 @@ fn iter_sparse_slice() -> Result<()> { Field::new("a", DataType::Int32, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Sparse); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Sparse); let types = Buffer::from(vec![0, 0, 1]); let fields = vec![ Int32Array::from(&[Some(1), Some(3), Some(2)]).boxed(), @@ -187,7 +187,7 @@ fn iter_dense_slice() -> Result<()> { Field::new("a", DataType::Int32, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Dense); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Dense); let types = Buffer::from(vec![0, 0, 1]); let offsets = Buffer::::from(vec![0, 1, 0]); let fields = vec![ @@ -214,7 +214,7 @@ fn scalar() -> Result<()> { Field::new("a", DataType::Int32, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Dense); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Dense); let types = Buffer::from(vec![0, 0, 1]); let offsets = Buffer::::from(vec![0, 1, 0]); let fields = vec![ @@ -271,7 +271,7 @@ fn dense_without_offsets_is_error() { Field::new("a", DataType::Int32, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Dense); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Dense); let types = vec![0, 0, 1].into(); let fields = vec![ Int32Array::from([Some(1), Some(3), Some(2)]).boxed(), 
@@ -287,7 +287,7 @@ fn fields_must_match() { Field::new("a", DataType::Int64, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Sparse); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Sparse); let types = vec![0, 0, 1].into(); let fields = vec![ Int32Array::from([Some(1), Some(3), Some(2)]).boxed(), @@ -303,7 +303,7 @@ fn sparse_with_offsets_is_error() { Field::new("a", DataType::Int32, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Sparse); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Sparse); let fields = vec![ Int32Array::from([Some(1), Some(3), Some(2)]).boxed(), Utf8Array::::from([Some("a"), Some("b"), Some("c")]).boxed(), @@ -321,7 +321,7 @@ fn offsets_must_be_in_bounds() { Field::new("a", DataType::Int32, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Sparse); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Sparse); let fields = vec![ Int32Array::from([Some(1), Some(3), Some(2)]).boxed(), Utf8Array::::from([Some("a"), Some("b"), Some("c")]).boxed(), @@ -340,7 +340,7 @@ fn sparse_with_wrong_offsets1_is_error() { Field::new("a", DataType::Int32, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Sparse); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Sparse); let fields = vec![ Int32Array::from([Some(1), Some(3), Some(2)]).boxed(), Utf8Array::::from([Some("a"), Some("b"), Some("c")]).boxed(), @@ -359,7 +359,7 @@ fn types_must_be_in_bounds() -> Result<()> { Field::new("a", DataType::Int32, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Sparse); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Sparse); let 
fields = vec![ Int32Array::from([Some(1), Some(3), Some(2)]).boxed(), Utf8Array::::from([Some("a"), Some("b"), Some("c")]).boxed(), diff --git a/tests/it/arrow.rs b/tests/it/arrow.rs index 7f013bfe2db..387235ce409 100644 --- a/tests/it/arrow.rs +++ b/tests/it/arrow.rs @@ -142,11 +142,11 @@ fn make_struct() -> StructArray { let nulls = [true, true, false].into_iter().collect(); StructArray::new( - DataType::Struct(vec![ + DataType::Struct(Arc::new(vec![ Field::new("a1", a1.data_type().clone(), true), Field::new("a2", a2.data_type().clone(), true), Field::new("a3", a3.data_type().clone(), true), - ]), + ])), vec![Box::new(a1), Box::new(a2), Box::new(a3)], Some(nulls), ) @@ -235,7 +235,7 @@ fn test_dictionary() { let dictionary = DictionaryArray::try_new( DataType::Dictionary( IntegerType::Int16, - Box::new(values.data_type().clone()), + std::sync::Arc::new(values.data_type().clone()), false, ), keys, @@ -287,10 +287,10 @@ fn test_map() { ); let values = PrimitiveArray::::from_iter([Some(1), None, Some(3), Some(1), None]); let fields = StructArray::new( - DataType::Struct(vec![ + DataType::Struct(Arc::new(vec![ Field::new("keys", DataType::Utf8, false), // Cannot be nullable Field::new("values", DataType::Int32, true), - ]), + ])), vec![Box::new(keys), Box::new(values)], None, // Cannot be nullable ); @@ -316,10 +316,10 @@ fn test_map() { #[test] fn test_dense_union() { - let fields = vec![ + let fields = Arc::new(vec![ Field::new("a1", DataType::Int32, true), Field::new("a2", DataType::Int64, true), - ]; + ]); let a1 = PrimitiveArray::from_iter([Some(2), None]); let a2 = PrimitiveArray::from_iter([Some(2_i64), None, Some(3)]); @@ -327,7 +327,7 @@ fn test_dense_union() { let types = vec![1, 0, 0, 1, 1]; let offsets = vec![0, 0, 1, 1, 2]; let union = UnionArray::new( - DataType::Union(fields.clone(), Some(vec![0, 1]), UnionMode::Dense), + DataType::Union(fields.clone(), Some(Arc::new(vec![0, 1])), UnionMode::Dense), types.into(), vec![Box::new(a1.clone()), 
Box::new(a2.clone())], Some(offsets.into()), @@ -338,7 +338,7 @@ fn test_dense_union() { let types = vec![1, 4, 4, 1, 1]; let offsets = vec![0, 0, 1, 1, 2]; let union = UnionArray::new( - DataType::Union(fields, Some(vec![4, 1]), UnionMode::Dense), + DataType::Union(fields, Some(Arc::new(vec![4, 1])), UnionMode::Dense), types.into(), vec![Box::new(a1), Box::new(a2)], Some(offsets.into()), @@ -349,17 +349,21 @@ fn test_dense_union() { #[test] fn test_sparse_union() { - let fields = vec![ + let fields = Arc::new(vec![ Field::new("a1", DataType::Int32, true), Field::new("a2", DataType::Int64, true), - ]; + ]); let a1 = PrimitiveArray::from_iter([None, Some(2), None, None, None]); let a2 = PrimitiveArray::from_iter([Some(2_i64), None, None, None, Some(3)]); let types = vec![1, 0, 0, 1, 1]; let union = UnionArray::new( - DataType::Union(fields.clone(), Some(vec![0, 1]), UnionMode::Sparse), + DataType::Union( + fields.clone(), + Some(Arc::new(vec![0, 1])), + UnionMode::Sparse, + ), types.into(), vec![Box::new(a1.clone()), Box::new(a2.clone())], None, @@ -369,7 +373,7 @@ fn test_sparse_union() { let types = vec![1, 4, 4, 1, 1]; let union = UnionArray::new( - DataType::Union(fields, Some(vec![4, 1]), UnionMode::Sparse), + DataType::Union(fields, Some(Arc::new(vec![4, 1])), UnionMode::Sparse), types.into(), vec![Box::new(a1), Box::new(a2)], None, diff --git a/tests/it/compute/cast.rs b/tests/it/compute/cast.rs index 22ec3fd040e..131f834f968 100644 --- a/tests/it/compute/cast.rs +++ b/tests/it/compute/cast.rs @@ -689,7 +689,7 @@ fn utf8_to_dict() { let array = Utf8Array::::from([Some("one"), None, Some("three"), Some("one")]); // Cast to a dictionary (same value type, Utf8) - let cast_type = DataType::Dictionary(u8::KEY_TYPE, Box::new(DataType::Utf8), false); + let cast_type = DataType::Dictionary(u8::KEY_TYPE, Arc::new(DataType::Utf8), false); let result = cast(&array, &cast_type, CastOptions::default()).expect("cast failed"); let mut expected = 
MutableDictionaryArray::>::new(); @@ -720,7 +720,7 @@ fn i32_to_dict() { let array = Int32Array::from(&[Some(1), None, Some(3), Some(1)]); // Cast to a dictionary (same value type, Utf8) - let cast_type = DataType::Dictionary(u8::KEY_TYPE, Box::new(DataType::Int32), false); + let cast_type = DataType::Dictionary(u8::KEY_TYPE, Arc::new(DataType::Int32), false); let result = cast(&array, &cast_type, CastOptions::default()).expect("cast failed"); let mut expected = MutableDictionaryArray::>::new(); @@ -902,7 +902,7 @@ fn dict_keys() { let result = cast( &array, - &DataType::Dictionary(IntegerType::Int64, Box::new(DataType::Utf8), false), + &DataType::Dictionary(IntegerType::Int64, Arc::new(DataType::Utf8), false), CastOptions::default(), ) .expect("cast failed"); diff --git a/tests/it/compute/comparison.rs b/tests/it/compute/comparison.rs index a63bb39ce01..9e60fb071a4 100644 --- a/tests/it/compute/comparison.rs +++ b/tests/it/compute/comparison.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::array::*; use arrow2::bitmap::Bitmap; use arrow2::compute::comparison::{self, boolean::*, primitive, utf8}; @@ -42,7 +44,7 @@ fn consistency() { Duration(TimeUnit::Millisecond), Duration(TimeUnit::Microsecond), Duration(TimeUnit::Nanosecond), - Dictionary(IntegerType::Int32, Box::new(LargeBinary), false), + Dictionary(IntegerType::Int32, Arc::new(LargeBinary), false), ]; // array <> array diff --git a/tests/it/compute/sort/row/mod.rs b/tests/it/compute/sort/row/mod.rs index 4931689a192..4ec7617b265 100644 --- a/tests/it/compute/sort/row/mod.rs +++ b/tests/it/compute/sort/row/mod.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::{ array::{ Array, BinaryArray, BooleanArray, DictionaryArray, Float32Array, Int128Array, Int16Array, @@ -265,7 +267,7 @@ fn test_dictionary_nulls() { let values = Int32Array::from_iter([Some(1), Some(-1), None, Some(4), None]); let keys = Int32Array::from_iter([Some(0), Some(0), Some(1), Some(2), Some(4), None]); - let data_type = 
DataType::Dictionary(IntegerType::Int32, Box::new(DataType::Int32), false); + let data_type = DataType::Dictionary(IntegerType::Int32, Arc::new(DataType::Int32), false); let data = DictionaryArray::try_from_keys(keys, values.to_boxed()).unwrap(); let mut converter = RowConverter::new(vec![SortField::new(data_type)]); diff --git a/tests/it/compute/take.rs b/tests/it/compute/take.rs index feaa0d82081..54899f92f84 100644 --- a/tests/it/compute/take.rs +++ b/tests/it/compute/take.rs @@ -72,7 +72,7 @@ fn create_test_struct() -> StructArray { Field::new("b", DataType::Int32, true), ]; StructArray::new( - DataType::Struct(fields), + DataType::Struct(std::sync::Arc::new(fields)), vec![boolean.boxed(), int.boxed()], validity, ) diff --git a/tests/it/ffi/data.rs b/tests/it/ffi/data.rs index afb263e8531..b0a02561f12 100644 --- a/tests/it/ffi/data.rs +++ b/tests/it/ffi/data.rs @@ -291,7 +291,7 @@ fn list_list() -> Result<()> { #[test] fn struct_() -> Result<()> { - let data_type = DataType::Struct(vec![Field::new("a", DataType::Int32, true)]); + let data_type = DataType::Struct(Arc::new(vec![Field::new("a", DataType::Int32, true)])); let values = vec![Int32Array::from([Some(1), None, Some(3)]).boxed()]; let validity = Bitmap::from([true, false, true]); @@ -323,7 +323,7 @@ fn schema() -> Result<()> { let field = Field::new( "a", - DataType::Dictionary(u32::KEY_TYPE, Box::new(DataType::Utf8), false), + DataType::Dictionary(u32::KEY_TYPE, Arc::new(DataType::Utf8), false), true, ); test_round_trip_schema(field)?; @@ -341,8 +341,8 @@ fn extension() -> Result<()> { "a", DataType::Extension( "a".to_string(), - Box::new(DataType::Int32), - Some("bla".to_string()), + Arc::new(DataType::Int32), + Some("bla".to_string()).map(Arc::new), ), true, ); @@ -355,11 +355,11 @@ fn extension_children() -> Result<()> { "a", DataType::Extension( "b".to_string(), - Box::new(DataType::Struct(vec![Field::new( + Arc::new(DataType::Struct(Arc::new(vec![Field::new( "c", DataType::Int32, true, - )])), + 
)]))), None, ), true, diff --git a/tests/it/io/avro/read.rs b/tests/it/io/avro/read.rs index 88125087e09..b81c2a72e88 100644 --- a/tests/it/io/avro/read.rs +++ b/tests/it/io/avro/read.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::chunk::Chunk; use avro_rs::types::{Record, Value}; use avro_rs::{Codec, Writer}; @@ -73,23 +75,27 @@ pub(super) fn schema() -> (AvroSchema, Schema) { Field::new("g", DataType::Utf8, true), Field::new( "h", - DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, true))), + DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Int32, + true, + ))), false, ), Field::new( "i", - DataType::Struct(vec![Field::new("e", DataType::Float64, false)]), + DataType::Struct(Arc::new(vec![Field::new("e", DataType::Float64, false)])), false, ), Field::new( "enum", - DataType::Dictionary(i32::KEY_TYPE, Box::new(DataType::Utf8), false), + DataType::Dictionary(i32::KEY_TYPE, Arc::new(DataType::Utf8), false), false, ), Field::new("decimal", DataType::Decimal(18, 5), false), Field::new( "nullable_struct", - DataType::Struct(vec![Field::new("e", DataType::Float64, false)]), + DataType::Struct(Arc::new(vec![Field::new("e", DataType::Float64, false)])), true, ), ]); @@ -117,7 +123,7 @@ pub(super) fn data() -> Chunk> { Utf8Array::::from([Some("foo"), None]).boxed(), array.into_box(), StructArray::new( - DataType::Struct(vec![Field::new("e", DataType::Float64, false)]), + DataType::Struct(Arc::new(vec![Field::new("e", DataType::Float64, false)])), vec![PrimitiveArray::::from_slice([1.0, 2.0]).boxed()], None, ) @@ -132,7 +138,7 @@ pub(super) fn data() -> Chunk> { .to(DataType::Decimal(18, 5)) .boxed(), StructArray::new( - DataType::Struct(vec![Field::new("e", DataType::Float64, false)]), + DataType::Struct(Arc::new(vec![Field::new("e", DataType::Float64, false)])), vec![PrimitiveArray::::from_slice([1.0, 0.0]).boxed()], Some([true, false].into()), ) @@ -331,7 +337,11 @@ fn schema_list() -> (AvroSchema, Schema) { let schema = 
Schema::from(vec![Field::new( "h", - DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, false))), + DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Int32, + false, + ))), false, )]); @@ -343,7 +353,11 @@ pub(super) fn data_list() -> Chunk> { let mut array = MutableListArray::>::new_from( Default::default(), - DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, false))), + DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Int32, + false, + ))), 0, ); array.try_extend(data).unwrap(); diff --git a/tests/it/io/avro/write.rs b/tests/it/io/avro/write.rs index 5e995e7a095..7fe4bcd57b0 100644 --- a/tests/it/io/avro/write.rs +++ b/tests/it/io/avro/write.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::array::*; use arrow2::chunk::Chunk; use arrow2::datatypes::*; @@ -42,20 +44,36 @@ pub(super) fn schema() -> Schema { ), Field::new( "list", - DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, true))), + DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Int32, + true, + ))), false, ), Field::new( "list nullable", - DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, true))), + DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Int32, + true, + ))), true, ), ]) } pub(super) fn data() -> Chunk> { - let list_dt = DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, true))); - let list_dt1 = DataType::List(std::sync::Arc::new(Field::new("item", DataType::Int32, true))); + let list_dt = DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Int32, + true, + ))); + let list_dt1 = DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Int32, + true, + ))); let columns = vec![ Box::new(Int64Array::from_slice([27, 47])) as Box, @@ -242,28 +260,28 @@ fn struct_schema() -> Schema { Schema::from(vec![ Field::new( "struct", - DataType::Struct(vec![ + DataType::Struct(Arc::new(vec![ 
Field::new("item1", DataType::Int32, false), Field::new("item2", DataType::Int32, true), - ]), + ])), false, ), Field::new( "struct nullable", - DataType::Struct(vec![ + DataType::Struct(Arc::new(vec![ Field::new("item1", DataType::Int32, false), Field::new("item2", DataType::Int32, true), - ]), + ])), true, ), ]) } fn struct_data() -> Chunk> { - let struct_dt = DataType::Struct(vec![ + let struct_dt = DataType::Struct(Arc::new(vec![ Field::new("item1", DataType::Int32, false), Field::new("item2", DataType::Int32, true), - ]); + ])); Chunk::new(vec![ Box::new(StructArray::new( diff --git a/tests/it/io/ipc/mmap.rs b/tests/it/io/ipc/mmap.rs index 11c89ae02fd..7e9533c1c7d 100644 --- a/tests/it/io/ipc/mmap.rs +++ b/tests/it/io/ipc/mmap.rs @@ -98,7 +98,11 @@ fn struct_() -> Result<()> { let array = PrimitiveArray::::from([None, None, None, Some(3), Some(4)]).boxed(); let array = StructArray::new( - DataType::Struct(vec![Field::new("f1", array.data_type().clone(), true)]), + DataType::Struct(Arc::new(vec![Field::new( + "f1", + array.data_type().clone(), + true, + )])), vec![array], Some([true, true, false, true, false].into()), ) diff --git a/tests/it/io/json/read.rs b/tests/it/io/json/read.rs index 37063165446..ba4665ef040 100644 --- a/tests/it/io/json/read.rs +++ b/tests/it/io/json/read.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::array::*; use arrow2::datatypes::*; use arrow2::error::Result; @@ -24,7 +26,7 @@ fn read_json() -> Result<()> { let result = read::deserialize(&json, data_type)?; let expected = StructArray::new( - DataType::Struct(vec![Field::new("a", DataType::Int64, true)]), + DataType::Struct(Arc::new(vec![Field::new("a", DataType::Int64, true)])), vec![Box::new(Int64Array::from_slice([1, 2, 3])) as _], None, ); @@ -260,7 +262,10 @@ fn deserialize_timestamp_string_tz_s() -> Result<()> { let data_type = DataType::List(std::sync::Arc::new(Field::new( "item", - DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("+01:00".to_string()))), + 
DataType::Timestamp( + TimeUnit::Second, + Some(std::sync::Arc::new("+01:00".to_string())), + ), false, ))); diff --git a/tests/it/io/json/write.rs b/tests/it/io/json/write.rs index ba07cf33298..adb51d10b58 100644 --- a/tests/it/io/json/write.rs +++ b/tests/it/io/json/write.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::datatypes::IntegerType; use arrow2::{ array::*, @@ -74,7 +76,7 @@ fn dictionary_utf8() -> Result<()> { let values = Utf8Array::::from([Some("a"), Some("b"), Some("c"), Some("d")]); let keys = PrimitiveArray::from_slice([0u32, 1, 2, 3, 1]); let array = DictionaryArray::try_new( - DataType::Dictionary(IntegerType::UInt32, Box::new(DataType::LargeUtf8), false), + DataType::Dictionary(IntegerType::UInt32, Arc::new(DataType::LargeUtf8), false), keys, Box::new(values), ) @@ -90,10 +92,10 @@ fn struct_() -> Result<()> { let c1 = Int32Array::from([Some(1), Some(2), Some(3), None, Some(5)]); let c2 = Utf8Array::::from([Some("a"), Some("b"), Some("c"), Some("d"), None]); - let data_type = DataType::Struct(vec![ + let data_type = DataType::Struct(Arc::new(vec![ Field::new("c1", c1.data_type().clone(), true), Field::new("c2", c2.data_type().clone(), true), - ]); + ])); let array = StructArray::new(data_type, vec![Box::new(c1) as _, Box::new(c2)], None); let expected = r#"[{"c1":1,"c2":"a"},{"c1":2,"c2":"b"},{"c1":3,"c2":"c"},{"c1":null,"c2":"d"},{"c1":5,"c2":null}]"#; @@ -103,14 +105,14 @@ fn struct_() -> Result<()> { #[test] fn nested_struct_with_validity() -> Result<()> { - let inner = vec![ + let inner = Arc::new(vec![ Field::new("c121", DataType::Utf8, false), Field::new("c122", DataType::Int32, false), - ]; - let fields = vec![ + ]); + let fields = Arc::new(vec![ Field::new("c11", DataType::Int32, false), Field::new("c12", DataType::Struct(inner.clone()), false), - ]; + ]); let c1 = StructArray::new( DataType::Struct(fields), @@ -130,10 +132,10 @@ fn nested_struct_with_validity() -> Result<()> { ); let c2 = Utf8Array::::from([Some("a"), Some("b"), 
Some("c")]); - let data_type = DataType::Struct(vec![ + let data_type = DataType::Struct(Arc::new(vec![ Field::new("c1", c1.data_type().clone(), true), Field::new("c2", c2.data_type().clone(), true), - ]); + ])); let array = StructArray::new(data_type, vec![c1.boxed(), c2.boxed()], None); let expected = r#"[{"c1":{"c11":1,"c12":null},"c2":"a"},{"c1":{"c11":null,"c12":{"c121":"f","c122":null}},"c2":"b"},{"c1":null,"c2":"c"}]"#; @@ -144,17 +146,17 @@ fn nested_struct_with_validity() -> Result<()> { #[test] fn nested_struct() -> Result<()> { let c121 = Field::new("c121", DataType::Utf8, false); - let fields = vec![ + let fields = Arc::new(vec![ Field::new("c11", DataType::Int32, false), - Field::new("c12", DataType::Struct(vec![c121.clone()]), false), - ]; + Field::new("c12", DataType::Struct(Arc::new(vec![c121.clone()])), false), + ]); let c1 = StructArray::new( DataType::Struct(fields), vec![ Int32Array::from(&[Some(1), None, Some(5)]).boxed(), StructArray::new( - DataType::Struct(vec![c121]), + DataType::Struct(Arc::new(vec![c121])), vec![Box::new(Utf8Array::::from([ Some("e"), Some("f"), @@ -169,10 +171,10 @@ fn nested_struct() -> Result<()> { let c2 = Utf8Array::::from([Some("a"), Some("b"), Some("c")]); - let data_type = DataType::Struct(vec![ + let data_type = DataType::Struct(Arc::new(vec![ Field::new("c1", c1.data_type().clone(), true), Field::new("c2", c2.data_type().clone(), true), - ]); + ])); let array = StructArray::new(data_type, vec![c1.boxed(), c2.boxed()], None); let expected = r#"[{"c1":{"c11":1,"c12":{"c121":"e"}},"c2":"a"},{"c1":{"c11":null,"c12":{"c121":"f"}},"c2":"b"},{"c1":{"c11":5,"c12":{"c121":"g"}},"c2":"c"}]"#; @@ -198,10 +200,10 @@ fn struct_with_list_field() -> Result<()> { let c2 = PrimitiveArray::from_slice([1, 2, 3, 4, 5]); - let data_type = DataType::Struct(vec![ + let data_type = DataType::Struct(Arc::new(vec![ Field::new("c1", c1.data_type().clone(), true), Field::new("c2", c2.data_type().clone(), true), - ]); + ])); let array = 
StructArray::new(data_type, vec![c1.boxed(), c2.boxed()], None); let expected = r#"[{"c1":["a","a1"],"c2":1},{"c1":["b"],"c2":2},{"c1":["c"],"c2":3},{"c1":["d"],"c2":4},{"c1":["e"],"c2":5}]"#; @@ -233,10 +235,10 @@ fn nested_list() -> Result<()> { let c2 = Utf8Array::::from([Some("foo"), Some("bar"), None]); - let data_type = DataType::Struct(vec![ + let data_type = DataType::Struct(Arc::new(vec![ Field::new("c1", c1.data_type().clone(), true), Field::new("c2", c2.data_type().clone(), true), - ]); + ])); let array = StructArray::new(data_type, vec![c1.boxed(), c2.boxed()], None); let expected = @@ -321,11 +323,11 @@ fn fixed_size_list_records() -> Result<()> { #[test] fn list_of_struct() -> Result<()> { - let inner = vec![Field::new("c121", DataType::Utf8, false)]; - let fields = vec![ + let inner = Arc::new(vec![Field::new("c121", DataType::Utf8, false)]); + let fields = Arc::new(vec![ Field::new("c11", DataType::Int32, false), Field::new("c12", DataType::Struct(inner.clone()), false), - ]; + ]); let c1_datatype = DataType::List(std::sync::Arc::new(Field::new( "s", DataType::Struct(fields.clone()), @@ -363,10 +365,10 @@ fn list_of_struct() -> Result<()> { let c2 = Int32Array::from_slice([1, 2, 3]); - let data_type = DataType::Struct(vec![ + let data_type = DataType::Struct(Arc::new(vec![ Field::new("c1", c1.data_type().clone(), true), Field::new("c2", c2.data_type().clone(), true), - ]); + ])); let array = StructArray::new(data_type, vec![c1.boxed(), c2.boxed()], None); let expected = r#"[{"c1":[{"c11":1,"c12":null},{"c11":null,"c12":{"c121":"f"}}],"c2":1},{"c1":null,"c2":2},{"c1":[null],"c2":3}]"#; diff --git a/tests/it/io/ndjson/mod.rs b/tests/it/io/ndjson/mod.rs index f11e15b1ed0..124fa1a6552 100644 --- a/tests/it/io/ndjson/mod.rs +++ b/tests/it/io/ndjson/mod.rs @@ -1,5 +1,7 @@ mod read; +use std::sync::Arc; + use arrow2::array::*; use arrow2::bitmap::Bitmap; use arrow2::datatypes::*; @@ -46,20 +48,28 @@ fn case_list() -> (String, Box) { "# .to_string(); - let 
data_type = DataType::Struct(vec![ + let data_type = DataType::Struct(Arc::new(vec![ Field::new("a", DataType::Int64, true), Field::new( "b", - DataType::List(std::sync::Arc::new(Field::new("item", DataType::Float64, true))), + DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Float64, + true, + ))), true, ), Field::new( "c", - DataType::List(std::sync::Arc::new(Field::new("item", DataType::Boolean, true))), + DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Boolean, + true, + ))), true, ), Field::new("d", DataType::Utf8, true), - ]); + ])); let a = Int64Array::from(&[Some(1), Some(-10), None]); let mut b = MutableListArray::>::new(); @@ -102,7 +112,7 @@ fn case_dict() -> (String, Box) { let data_type = DataType::List(std::sync::Arc::new(Field::new( "item", - DataType::Dictionary(u64::KEY_TYPE, Box::new(DataType::Utf8), false), + DataType::Dictionary(u64::KEY_TYPE, Arc::new(DataType::Utf8), false), true, ))); @@ -130,7 +140,12 @@ fn case_dict() -> (String, Box) { ( data, - StructArray::new(DataType::Struct(fields), vec![array.boxed()], None).boxed(), + StructArray::new( + DataType::Struct(std::sync::Arc::new(fields)), + vec![array.boxed()], + None, + ) + .boxed(), ) } @@ -139,12 +154,12 @@ fn case_basics() -> (String, Box) { {"a":-10, "b":-3.5, "c":true, "d":null} {"a":100000000, "b":0.6, "d":"text"}"# .to_string(); - let data_type = DataType::Struct(vec![ + let data_type = DataType::Struct(Arc::new(vec![ Field::new("a", DataType::Int64, true), Field::new("b", DataType::Float64, true), Field::new("c", DataType::Boolean, true), Field::new("d", DataType::Utf8, true), - ]); + ])); let array = StructArray::new( data_type, vec![ @@ -163,13 +178,13 @@ fn case_projection() -> (String, Box) { {"a":10, "b":-3.5, "c":true, "d":null, "e":"text"} {"a":100000000, "b":0.6, "d":"text"}"# .to_string(); - let data_type = DataType::Struct(vec![ + let data_type = DataType::Struct(Arc::new(vec![ Field::new("a", DataType::UInt32, true), 
Field::new("b", DataType::Float32, true), Field::new("c", DataType::Boolean, true), // note how "d" is not here Field::new("e", DataType::Binary, true), - ]); + ])); let array = StructArray::new( data_type, vec![ @@ -191,27 +206,30 @@ fn case_struct() -> (String, Box) { .to_string(); let d_field = Field::new("d", DataType::Utf8, true); - let c_field = Field::new("c", DataType::Struct(vec![d_field.clone()]), true); + let c_field = Field::new("c", DataType::Struct(Arc::new(vec![d_field.clone()])), true); let a_field = Field::new( "a", - DataType::Struct(vec![ + DataType::Struct(Arc::new(vec![ Field::new("b", DataType::Boolean, true), c_field.clone(), - ]), + ])), true, ); - let fields = vec![a_field]; + let fields = Arc::new(vec![a_field]); // build expected output let d = Utf8Array::::from([Some("text"), None, Some("text"), None]); let c = StructArray::new( - DataType::Struct(vec![d_field]), + DataType::Struct(Arc::new(vec![d_field])), vec![d.boxed()], Some([true, false, true, true].into()), ); let b = BooleanArray::from(vec![Some(true), Some(false), Some(true), None]); - let inner = DataType::Struct(vec![Field::new("b", DataType::Boolean, true), c_field]); + let inner = DataType::Struct(Arc::new(vec![ + Field::new("b", DataType::Boolean, true), + c_field, + ])); let expected = StructArray::new( inner, vec![b.boxed(), c.boxed()], @@ -228,11 +246,11 @@ fn case_struct() -> (String, Box) { fn case_nested_list() -> (String, Box) { let d_field = Field::new("d", DataType::Utf8, true); - let c_field = Field::new("c", DataType::Struct(vec![d_field.clone()]), true); + let c_field = Field::new("c", DataType::Struct(Arc::new(vec![d_field.clone()])), true); let b_field = Field::new("b", DataType::Boolean, true); let a_struct_field = Field::new( "a", - DataType::Struct(vec![b_field.clone(), c_field.clone()]), + DataType::Struct(Arc::new(vec![b_field.clone(), c_field.clone()])), true, ); let a_list_data_type = DataType::List(std::sync::Arc::new(a_struct_field)); @@ -257,7 +275,7 
@@ fn case_nested_list() -> (String, Box) { ]); let c = StructArray::new( - DataType::Struct(vec![d_field]), + DataType::Struct(Arc::new(vec![d_field])), vec![d.boxed()], Some(Bitmap::from_u8_slice([0b11111011], 6)), ); @@ -271,7 +289,7 @@ fn case_nested_list() -> (String, Box) { Some(true), ]); let a_struct = StructArray::new( - DataType::Struct(vec![b_field, c_field]), + DataType::Struct(Arc::new(vec![b_field, c_field])), vec![b.boxed(), c.boxed()], None, ); @@ -283,7 +301,7 @@ fn case_nested_list() -> (String, Box) { ); let array = StructArray::new( - DataType::Struct(vec![a_field]), + DataType::Struct(Arc::new(vec![a_field])), vec![expected.boxed()], None, ) @@ -316,7 +334,7 @@ fn infer_object() -> Result<()> { let utf8_fld = Field::new("utf8", DataType::Utf8, true); let bools_fld = Field::new("bools", DataType::Boolean, true); - let expected = DataType::Struct(vec![u64_fld, f64_fld, utf8_fld, bools_fld]); + let expected = DataType::Struct(Arc::new(vec![u64_fld, f64_fld, utf8_fld, bools_fld])); let actual = infer(data)?; assert_eq!(expected, actual); diff --git a/tests/it/io/ndjson/read.rs b/tests/it/io/ndjson/read.rs index 82553ef36d2..9d85f1f51e4 100644 --- a/tests/it/io/ndjson/read.rs +++ b/tests/it/io/ndjson/read.rs @@ -1,4 +1,5 @@ use std::io::Cursor; +use std::sync::Arc; use arrow2::array::*; use arrow2::datatypes::{DataType, Field}; @@ -89,13 +90,13 @@ fn case_nested_struct() -> (String, Box) { {"a": {"a": 2.0, "b": 2}} "#; - let inner = DataType::Struct(vec![ + let inner = DataType::Struct(Arc::new(vec![ Field::new("a", DataType::Float64, true), Field::new("b", DataType::Int64, true), Field::new("c", DataType::Boolean, true), - ]); + ])); - let data_type = DataType::Struct(vec![Field::new("a", inner.clone(), true)]); + let data_type = DataType::Struct(Arc::new(vec![Field::new("a", inner.clone(), true)])); let values = vec![ Float64Array::from([Some(2.0), None, Some(2.0), Some(2.0)]).boxed(), @@ -168,20 +169,28 @@ fn infer_schema_mixed_list() -> 
Result<()> { {"a":3, "b":4, "c": true, "d":[1, false, "array", 2.4]} "#; - let expected = DataType::Struct(vec![ + let expected = DataType::Struct(Arc::new(vec![ Field::new("a", DataType::Int64, true), Field::new( "b", - DataType::List(std::sync::Arc::new(Field::new("item", DataType::Float64, true))), + DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Float64, + true, + ))), true, ), Field::new( "c", - DataType::List(std::sync::Arc::new(Field::new("item", DataType::Boolean, true))), + DataType::List(std::sync::Arc::new(Field::new( + "item", + DataType::Boolean, + true, + ))), true, ), Field::new("d", DataType::Utf8, true), - ]); + ])); let result = infer(ndjson)?; @@ -240,10 +249,10 @@ fn line_break_in_values() -> Result<()> { fn invalid_read_record() -> Result<()> { let fields = vec![Field::new( "a", - DataType::Struct(vec![Field::new("a", DataType::Utf8, true)]), + DataType::Struct(Arc::new(vec![Field::new("a", DataType::Utf8, true)])), true, )]; - let data_type = DataType::Struct(fields); + let data_type = DataType::Struct(std::sync::Arc::new(fields)); let arrays = read_and_deserialize("city,lat,lng", &data_type, 1000); assert_eq!( @@ -262,7 +271,7 @@ fn skip_empty_lines() -> Result<()> { {\"a\": 3}"; - let data_type = DataType::Struct(vec![Field::new("a", DataType::Int64, true)]); + let data_type = DataType::Struct(Arc::new(vec![Field::new("a", DataType::Int64, true)])); let arrays = read_and_deserialize(ndjson, &data_type, 1000)?; assert_eq!(1, arrays.len()); diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index d3f651d0a90..6fa07bc3989 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -34,7 +34,11 @@ fn new_struct( .zip(arrays.iter()) .map(|(n, a)| Field::new(n, a.data_type().clone(), true)) .collect(); - StructArray::new(DataType::Struct(fields), arrays, validity) + StructArray::new( + DataType::Struct(std::sync::Arc::new(fields)), + arrays, + validity, + ) } pub fn read_column(mut reader: R, 
column: &str) -> Result { @@ -108,7 +112,11 @@ pub fn pyarrow_nested_edge(column: &str) -> Box { None, ); StructArray::new( - DataType::Struct(vec![Field::new("f1", a.data_type().clone(), true)]), + DataType::Struct(Arc::new(vec![Field::new( + "f1", + a.data_type().clone(), + true, + )])), vec![a.boxed()], None, ) @@ -586,7 +594,10 @@ pub fn pyarrow_nullable(column: &str) -> Box { PrimitiveArray::::from(i64_values).to(DataType::Timestamp(TimeUnit::Second, None)), ), "timestamp_s_utc" => Box::new(PrimitiveArray::::from(i64_values).to( - DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("UTC".to_string()))), + DataType::Timestamp( + TimeUnit::Second, + Some(std::sync::Arc::new("UTC".to_string())), + ), )), _ => unreachable!(), } @@ -1072,7 +1083,7 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics { .zip(arrays.iter()) .map(|(n, a)| Field::new(n, a.data_type().clone(), true)) .collect(); - StructArray::new(DataType::Struct(fields), arrays, None) + StructArray::new(DataType::Struct(std::sync::Arc::new(fields)), arrays, None) }; let names = vec!["f1".to_string()]; @@ -1178,18 +1189,28 @@ pub fn pyarrow_struct(column: &str) -> Box { Field::new("f2", DataType::Boolean, true), ]; match column { - "struct" => StructArray::new(DataType::Struct(fields), vec![string, boolean], None).boxed(), + "struct" => StructArray::new( + DataType::Struct(std::sync::Arc::new(fields)), + vec![string, boolean], + None, + ) + .boxed(), "struct_nullable" => { let values = vec![string, boolean]; - StructArray::new(DataType::Struct(fields), values, Some(mask.into())).boxed() + StructArray::new( + DataType::Struct(std::sync::Arc::new(fields)), + values, + Some(mask.into()), + ) + .boxed() } "struct_struct" => { let struct_ = pyarrow_struct("struct"); Box::new(StructArray::new( - DataType::Struct(vec![ - Field::new("f1", DataType::Struct(fields), true), + DataType::Struct(Arc::new(vec![ + Field::new("f1", DataType::Struct(std::sync::Arc::new(fields)), true), 
Field::new("f2", DataType::Boolean, true), - ]), + ])), vec![struct_, boolean], None, )) @@ -1197,10 +1218,10 @@ pub fn pyarrow_struct(column: &str) -> Box { "struct_struct_nullable" => { let struct_ = pyarrow_struct("struct"); Box::new(StructArray::new( - DataType::Struct(vec![ - Field::new("f1", DataType::Struct(fields), true), + DataType::Struct(Arc::new(vec![ + Field::new("f1", DataType::Struct(std::sync::Arc::new(fields)), true), Field::new("f2", DataType::Boolean, true), - ]), + ])), vec![struct_, boolean], Some(mask.into()), )) @@ -1381,10 +1402,10 @@ pub fn pyarrow_map(column: &str) -> Box { "map" => { let s1 = [Some("a1"), Some("a2")]; let s2 = [Some("b1"), Some("b2")]; - let dt = DataType::Struct(vec![ + let dt = DataType::Struct(Arc::new(vec![ Field::new("key", DataType::Utf8, false), Field::new("value", DataType::Utf8, true), - ]); + ])); MapArray::try_new( DataType::Map( std::sync::Arc::new(Field::new("entries", dt.clone(), false)), @@ -1409,10 +1430,10 @@ pub fn pyarrow_map(column: &str) -> Box { "map_nullable" => { let s1 = [Some("a1"), Some("a2")]; let s2 = [Some("b1"), None]; - let dt = DataType::Struct(vec![ + let dt = DataType::Struct(Arc::new(vec![ Field::new("key", DataType::Utf8, false), Field::new("value", DataType::Utf8, true), - ]); + ])); MapArray::try_new( DataType::Map( std::sync::Arc::new(Field::new("entries", dt.clone(), false)), @@ -1440,11 +1461,13 @@ pub fn pyarrow_map(column: &str) -> Box { pub fn pyarrow_map_statistics(column: &str) -> Statistics { let new_map = |arrays: Vec>, fields: Vec| { - let fields = fields - .into_iter() - .zip(arrays.iter()) - .map(|(f, a)| Field::new(f.name, a.data_type().clone(), f.is_nullable)) - .collect::>(); + let fields = Arc::new( + fields + .into_iter() + .zip(arrays.iter()) + .map(|(f, a)| Field::new(f.name, a.data_type().clone(), f.is_nullable)) + .collect::>(), + ); MapArray::new( DataType::Map( Arc::new(Field::new( diff --git a/tests/it/io/print.rs b/tests/it/io/print.rs index 
eca079a0cfd..3f23ea4c1ab 100644 --- a/tests/it/io/print.rs +++ b/tests/it/io/print.rs @@ -327,7 +327,7 @@ fn write_struct() -> Result<()> { let validity = Some(Bitmap::from(&[true, false, true])); - let array = StructArray::new(DataType::Struct(fields), values, validity); + let array = StructArray::new(DataType::Struct(std::sync::Arc::new(fields)), values, validity); let columns = Chunk::new(vec![&array as &dyn Array]); @@ -356,7 +356,7 @@ fn write_union() -> Result<()> { Field::new("a", DataType::Int32, true), Field::new("b", DataType::Utf8, true), ]; - let data_type = DataType::Union(fields, None, UnionMode::Sparse); + let data_type = DataType::Union(std::sync::Arc::new(fields), None, UnionMode::Sparse); let types = Buffer::from(vec![0, 0, 1]); let fields = vec![ Int32Array::from(&[Some(1), None, Some(2)]).boxed(), diff --git a/tests/it/scalar/map.rs b/tests/it/scalar/map.rs index 1fb29eeb628..b8fc5cd9601 100644 --- a/tests/it/scalar/map.rs +++ b/tests/it/scalar/map.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::{ array::{BooleanArray, StructArray, Utf8Array}, datatypes::{DataType, Field}, @@ -7,10 +9,10 @@ use arrow2::{ #[allow(clippy::eq_op)] #[test] fn equal() { - let kv_dt = DataType::Struct(vec![ + let kv_dt = DataType::Struct(Arc::new(vec![ Field::new("key", DataType::Utf8, false), Field::new("value", DataType::Boolean, true), - ]); + ])); let kv_array1 = StructArray::try_new( kv_dt.clone(), vec![ @@ -30,7 +32,10 @@ fn equal() { ) .unwrap(); - let dt = DataType::Map(std::sync::Arc::new(Field::new("entries", kv_dt, true)), false); + let dt = DataType::Map( + std::sync::Arc::new(Field::new("entries", kv_dt, true)), + false, + ); let a = MapScalar::new(dt.clone(), Some(Box::new(kv_array1))); let b = MapScalar::new(dt.clone(), None); assert_eq!(a, a); @@ -43,10 +48,10 @@ fn equal() { #[test] fn basics() { - let kv_dt = DataType::Struct(vec![ + let kv_dt = DataType::Struct(Arc::new(vec![ Field::new("key", DataType::Utf8, false), Field::new("value", 
DataType::Boolean, true), - ]); + ])); let kv_array = StructArray::try_new( kv_dt.clone(), vec![ @@ -57,7 +62,10 @@ fn basics() { ) .unwrap(); - let dt = DataType::Map(std::sync::Arc::new(Field::new("entries", kv_dt, true)), false); + let dt = DataType::Map( + std::sync::Arc::new(Field::new("entries", kv_dt, true)), + false, + ); let a = MapScalar::new(dt.clone(), Some(Box::new(kv_array.clone()))); assert_eq!(kv_array, a.values().as_ref()); diff --git a/tests/it/scalar/struct_.rs b/tests/it/scalar/struct_.rs index 2785ecb7b41..46839d3bc45 100644 --- a/tests/it/scalar/struct_.rs +++ b/tests/it/scalar/struct_.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow2::{ datatypes::{DataType, Field}, scalar::{BooleanScalar, Scalar, StructScalar}, @@ -6,7 +8,7 @@ use arrow2::{ #[allow(clippy::eq_op)] #[test] fn equal() { - let dt = DataType::Struct(vec![Field::new("a", DataType::Boolean, true)]); + let dt = DataType::Struct(Arc::new(vec![Field::new("a", DataType::Boolean, true)])); let a = StructScalar::new( dt.clone(), Some(vec![ @@ -29,7 +31,7 @@ fn equal() { #[test] fn basics() { - let dt = DataType::Struct(vec![Field::new("a", DataType::Boolean, true)]); + let dt = DataType::Struct(Arc::new(vec![Field::new("a", DataType::Boolean, true)])); let values = vec![Box::new(BooleanScalar::from(Some(true))) as Box]; From 40541b4f2ea211b3a0339db06aaf8705e953f712 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Fri, 12 Jan 2024 14:10:37 +0100 Subject: [PATCH 4/4] cargo fmt --- tests/it/array/growable/map.rs | 5 +- tests/it/array/growable/struct_.rs | 5 +- tests/it/array/growable/union.rs | 6 +- tests/it/compute/aggregate/memory.rs | 6 +- tests/it/compute/arithmetics/time.rs | 115 ++++++++++++++++----------- tests/it/io/csv/read.rs | 5 +- tests/it/io/print.rs | 11 ++- tests/it/scalar/fixed_size_list.rs | 10 ++- tests/it/scalar/list.rs | 12 ++- 9 files changed, 117 insertions(+), 58 deletions(-) diff --git a/tests/it/array/growable/map.rs b/tests/it/array/growable/map.rs index 
4025e6b52b7..c1c367dcbcd 100644 --- a/tests/it/array/growable/map.rs +++ b/tests/it/array/growable/map.rs @@ -29,7 +29,10 @@ fn some_values() -> (DataType, Vec>) { Field::new("key", DataType::Utf8, true), Field::new("val", DataType::Int32, true), ]; - (DataType::Struct(std::sync::Arc::new(fields)), vec![strings, ints]) + ( + DataType::Struct(std::sync::Arc::new(fields)), + vec![strings, ints], + ) } #[test] diff --git a/tests/it/array/growable/struct_.rs b/tests/it/array/growable/struct_.rs index 9ab9ba7303f..16d600dfe24 100644 --- a/tests/it/array/growable/struct_.rs +++ b/tests/it/array/growable/struct_.rs @@ -24,7 +24,10 @@ fn some_values() -> (DataType, Vec>) { Field::new("f1", DataType::Utf8, true), Field::new("f2", DataType::Int32, true), ]; - (DataType::Struct(std::sync::Arc::new(fields)), vec![strings, ints]) + ( + DataType::Struct(std::sync::Arc::new(fields)), + vec![strings, ints], + ) } #[test] diff --git a/tests/it/array/growable/union.rs b/tests/it/array/growable/union.rs index 756d4458f1f..65185ffecff 100644 --- a/tests/it/array/growable/union.rs +++ b/tests/it/array/growable/union.rs @@ -74,8 +74,10 @@ fn dense() -> Result<()> { #[test] fn complex_dense() -> Result<()> { - let fixed_size_type = - DataType::FixedSizeList(std::sync::Arc::new(Field::new("i", DataType::UInt16, true)), 3); + let fixed_size_type = DataType::FixedSizeList( + std::sync::Arc::new(Field::new("i", DataType::UInt16, true)), + 3, + ); let fields = vec![ Field::new("a", DataType::Int32, true), diff --git a/tests/it/compute/aggregate/memory.rs b/tests/it/compute/aggregate/memory.rs index cfee4e7e38e..1c5133aa030 100644 --- a/tests/it/compute/aggregate/memory.rs +++ b/tests/it/compute/aggregate/memory.rs @@ -24,8 +24,10 @@ fn utf8() { #[test] fn fixed_size_list() { - let data_type = - DataType::FixedSizeList(std::sync::Arc::new(Field::new("elem", DataType::Float32, false)), 3); + let data_type = DataType::FixedSizeList( + std::sync::Arc::new(Field::new("elem", DataType::Float32, 
false)), + 3, + ); let values = Box::new(Float32Array::from_slice([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])); let a = FixedSizeListArray::new(data_type, values, None); assert_eq!(6 * std::mem::size_of::(), estimated_bytes_size(&a)); diff --git a/tests/it/compute/arithmetics/time.rs b/tests/it/compute/arithmetics/time.rs index 6fdeec358c0..2c4d8a4a023 100644 --- a/tests/it/compute/arithmetics/time.rs +++ b/tests/it/compute/arithmetics/time.rs @@ -6,42 +6,47 @@ use arrow2::types::months_days_ns; #[test] fn test_adding_timestamp() { - let timestamp = - PrimitiveArray::from([Some(100000i64), Some(200000i64), None, Some(300000i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), - ); + let timestamp = PrimitiveArray::from([Some(100000i64), Some(200000i64), None, Some(300000i64)]) + .to(DataType::Timestamp( + TimeUnit::Second, + Some(std::sync::Arc::new("America/New_york".to_string())), + )); let duration = PrimitiveArray::from([Some(10i64), Some(20i64), None, Some(30i64)]) .to(DataType::Duration(TimeUnit::Second)); let result = add_duration(&timestamp, &duration); - let expected = - PrimitiveArray::from([Some(100010i64), Some(200020i64), None, Some(300030i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), - ); + let expected = PrimitiveArray::from([Some(100010i64), Some(200020i64), None, Some(300030i64)]) + .to(DataType::Timestamp( + TimeUnit::Second, + Some(std::sync::Arc::new("America/New_york".to_string())), + )); assert_eq!(result, expected); let duration = PrimitiveScalar::from(Some(10i64)).to(DataType::Duration(TimeUnit::Second)); let result = add_duration_scalar(&timestamp, &duration); - let expected = - PrimitiveArray::from([Some(100010i64), Some(200010i64), None, Some(300010i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), - ); + let expected = PrimitiveArray::from([Some(100010i64), Some(200010i64), None, 
Some(300010i64)]) + .to(DataType::Timestamp( + TimeUnit::Second, + Some(std::sync::Arc::new("America/New_york".to_string())), + )); assert_eq!(result, expected); } #[test] fn test_adding_duration_different_scale() { - let timestamp = - PrimitiveArray::from([Some(100000i64), Some(200000i64), None, Some(300000i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), - ); - let expected = - PrimitiveArray::from([Some(100010i64), Some(200020i64), None, Some(300030i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), - ); + let timestamp = PrimitiveArray::from([Some(100000i64), Some(200000i64), None, Some(300000i64)]) + .to(DataType::Timestamp( + TimeUnit::Second, + Some(std::sync::Arc::new("America/New_york".to_string())), + )); + let expected = PrimitiveArray::from([Some(100010i64), Some(200020i64), None, Some(300030i64)]) + .to(DataType::Timestamp( + TimeUnit::Second, + Some(std::sync::Arc::new("America/New_york".to_string())), + )); // Testing duration in milliseconds let duration = PrimitiveArray::from([Some(10_000i64), Some(20_000i64), None, Some(30_000i64)]) @@ -69,20 +74,29 @@ fn test_adding_duration_different_scale() { #[test] fn test_adding_subtract_timestamps_scale() { let timestamp = PrimitiveArray::from([Some(10i64), Some(20i64), None, Some(30i64)]).to( - DataType::Timestamp(TimeUnit::Millisecond, Some(std::sync::Arc::new("America/New_york".to_string()))), + DataType::Timestamp( + TimeUnit::Millisecond, + Some(std::sync::Arc::new("America/New_york".to_string())), + ), ); let duration = PrimitiveArray::from([Some(1i64), Some(2i64), None, Some(3i64)]) .to(DataType::Duration(TimeUnit::Second)); let expected = PrimitiveArray::from([Some(1_010i64), Some(2_020i64), None, Some(3_030i64)]).to( - DataType::Timestamp(TimeUnit::Millisecond, Some(std::sync::Arc::new("America/New_york".to_string()))), + DataType::Timestamp( + TimeUnit::Millisecond, + 
Some(std::sync::Arc::new("America/New_york".to_string())), + ), ); let result = add_duration(&timestamp, &duration); assert_eq!(result, expected); let timestamp = PrimitiveArray::from([Some(10i64), Some(20i64), None, Some(30i64)]).to( - DataType::Timestamp(TimeUnit::Nanosecond, Some(std::sync::Arc::new("America/New_york".to_string()))), + DataType::Timestamp( + TimeUnit::Nanosecond, + Some(std::sync::Arc::new("America/New_york".to_string())), + ), ); let duration = PrimitiveArray::from([Some(1i64), Some(2i64), None, Some(3i64)]) .to(DataType::Duration(TimeUnit::Second)); @@ -104,33 +118,37 @@ fn test_adding_subtract_timestamps_scale() { #[test] fn test_subtract_timestamp() { - let timestamp = - PrimitiveArray::from([Some(100000i64), Some(200000i64), None, Some(300000i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), - ); + let timestamp = PrimitiveArray::from([Some(100000i64), Some(200000i64), None, Some(300000i64)]) + .to(DataType::Timestamp( + TimeUnit::Second, + Some(std::sync::Arc::new("America/New_york".to_string())), + )); let duration = PrimitiveArray::from([Some(10i64), Some(20i64), None, Some(30i64)]) .to(DataType::Duration(TimeUnit::Second)); let result = subtract_duration(&timestamp, &duration); - let expected = - PrimitiveArray::from([Some(99990i64), Some(199980i64), None, Some(299970i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), - ); + let expected = PrimitiveArray::from([Some(99990i64), Some(199980i64), None, Some(299970i64)]) + .to(DataType::Timestamp( + TimeUnit::Second, + Some(std::sync::Arc::new("America/New_york".to_string())), + )); assert_eq!(result, expected); } #[test] fn test_subtracting_duration_different_scale() { - let timestamp = - PrimitiveArray::from([Some(100000i64), Some(200000i64), None, Some(300000i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), - ); - let 
expected = - PrimitiveArray::from([Some(99990i64), Some(199980i64), None, Some(299970i64)]).to( - DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("America/New_york".to_string()))), - ); + let timestamp = PrimitiveArray::from([Some(100000i64), Some(200000i64), None, Some(300000i64)]) + .to(DataType::Timestamp( + TimeUnit::Second, + Some(std::sync::Arc::new("America/New_york".to_string())), + )); + let expected = PrimitiveArray::from([Some(99990i64), Some(199980i64), None, Some(299970i64)]) + .to(DataType::Timestamp( + TimeUnit::Second, + Some(std::sync::Arc::new("America/New_york".to_string())), + )); // Testing duration in milliseconds let duration = PrimitiveArray::from([Some(10_000i64), Some(20_000i64), None, Some(30_000i64)]) @@ -158,21 +176,28 @@ fn test_subtracting_duration_different_scale() { #[test] fn test_subtracting_subtract_timestamps_scale() { let timestamp = PrimitiveArray::from([Some(10i64), Some(20i64), None, Some(30i64)]).to( - DataType::Timestamp(TimeUnit::Millisecond, Some(std::sync::Arc::new("America/New_york".to_string()))), + DataType::Timestamp( + TimeUnit::Millisecond, + Some(std::sync::Arc::new("America/New_york".to_string())), + ), ); let duration = PrimitiveArray::from([Some(1i64), Some(2i64), None, Some(3i64)]) .to(DataType::Duration(TimeUnit::Second)); - let expected = - PrimitiveArray::from([Some(-990i64), Some(-1_980i64), None, Some(-2_970i64)]).to( - DataType::Timestamp(TimeUnit::Millisecond, Some(std::sync::Arc::new("America/New_york".to_string()))), - ); + let expected = PrimitiveArray::from([Some(-990i64), Some(-1_980i64), None, Some(-2_970i64)]) + .to(DataType::Timestamp( + TimeUnit::Millisecond, + Some(std::sync::Arc::new("America/New_york".to_string())), + )); let result = subtract_duration(&timestamp, &duration); assert_eq!(result, expected); let timestamp = PrimitiveArray::from([Some(10i64), Some(20i64), None, Some(30i64)]).to( - DataType::Timestamp(TimeUnit::Nanosecond, 
Some(std::sync::Arc::new("America/New_york".to_string()))), + DataType::Timestamp( + TimeUnit::Nanosecond, + Some(std::sync::Arc::new("America/New_york".to_string())), + ), ); let duration = PrimitiveArray::from([Some(1i64), Some(2i64), None, Some(3i64)]) .to(DataType::Duration(TimeUnit::Second)); diff --git a/tests/it/io/csv/read.rs b/tests/it/io/csv/read.rs index ee9f584d46c..083b1805910 100644 --- a/tests/it/io/csv/read.rs +++ b/tests/it/io/csv/read.rs @@ -426,7 +426,10 @@ fn deserialize_timestamp() -> Result<()> { let input = vec!["1996-12-19T16:34:57-02:00", "1996-12-19T16:34:58-02:00"]; let input = input.join("\n"); - let data_type = DataType::Timestamp(TimeUnit::Millisecond, Some(std::sync::Arc::new("-01:00".to_string()))); + let data_type = DataType::Timestamp( + TimeUnit::Millisecond, + Some(std::sync::Arc::new("-01:00".to_string())), + ); let expected = Int64Array::from([Some(851020497000), Some(851020498000)]).to(data_type.clone()); diff --git a/tests/it/io/print.rs b/tests/it/io/print.rs index 3f23ea4c1ab..9ff261f9daf 100644 --- a/tests/it/io/print.rs +++ b/tests/it/io/print.rs @@ -161,7 +161,10 @@ fn write_timestamp_second_with_tz() { ]; check_datetime!( i64, - DataType::Timestamp(TimeUnit::Second, Some(std::sync::Arc::new("UTC".to_string()))), + DataType::Timestamp( + TimeUnit::Second, + Some(std::sync::Arc::new("UTC".to_string())) + ), 11111111, expected ); @@ -327,7 +330,11 @@ fn write_struct() -> Result<()> { let validity = Some(Bitmap::from(&[true, false, true])); - let array = StructArray::new(DataType::Struct(std::sync::Arc::new(fields)), values, validity); + let array = StructArray::new( + DataType::Struct(std::sync::Arc::new(fields)), + values, + validity, + ); let columns = Chunk::new(vec![&array as &dyn Array]); diff --git a/tests/it/scalar/fixed_size_list.rs b/tests/it/scalar/fixed_size_list.rs index ef8eddffb95..65f646466ef 100644 --- a/tests/it/scalar/fixed_size_list.rs +++ b/tests/it/scalar/fixed_size_list.rs @@ -7,7 +7,10 @@ use 
arrow2::{ #[allow(clippy::eq_op)] #[test] fn equal() { - let dt = DataType::FixedSizeList(std::sync::Arc::new(Field::new("a", DataType::Boolean, true)), 2); + let dt = DataType::FixedSizeList( + std::sync::Arc::new(Field::new("a", DataType::Boolean, true)), + 2, + ); let a = FixedSizeListScalar::new( dt.clone(), Some(BooleanArray::from_slice([true, false]).boxed()), @@ -26,7 +29,10 @@ fn equal() { #[test] fn basics() { - let dt = DataType::FixedSizeList(std::sync::Arc::new(Field::new("a", DataType::Boolean, true)), 2); + let dt = DataType::FixedSizeList( + std::sync::Arc::new(Field::new("a", DataType::Boolean, true)), + 2, + ); let a = FixedSizeListScalar::new( dt.clone(), Some(BooleanArray::from_slice([true, false]).boxed()), diff --git a/tests/it/scalar/list.rs b/tests/it/scalar/list.rs index 44e6e32b26e..ad2eed126d2 100644 --- a/tests/it/scalar/list.rs +++ b/tests/it/scalar/list.rs @@ -7,7 +7,11 @@ use arrow2::{ #[allow(clippy::eq_op)] #[test] fn equal() { - let dt = DataType::List(std::sync::Arc::new(Field::new("a", DataType::Boolean, true))); + let dt = DataType::List(std::sync::Arc::new(Field::new( + "a", + DataType::Boolean, + true, + ))); let a = ListScalar::::new( dt.clone(), Some(BooleanArray::from_slice([true, false]).boxed()), @@ -23,7 +27,11 @@ fn equal() { #[test] fn basics() { - let dt = DataType::List(std::sync::Arc::new(Field::new("a", DataType::Boolean, true))); + let dt = DataType::List(std::sync::Arc::new(Field::new( + "a", + DataType::Boolean, + true, + ))); let a = ListScalar::::new( dt.clone(), Some(BooleanArray::from_slice([true, false]).boxed()),