diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 7792e245febe..adafbfa9b72c 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -250,285 +250,6 @@ impl Field { } } - /// Parse a `Field` definition from a JSON representation. - #[cfg(feature = "json")] - pub fn from(json: &serde_json::Value) -> Result { - use serde_json::Value; - match *json { - Value::Object(ref map) => { - let name = match map.get("name") { - Some(&Value::String(ref name)) => name.to_string(), - _ => { - return Err(ArrowError::ParseError( - "Field missing 'name' attribute".to_string(), - )); - } - }; - let nullable = match map.get("nullable") { - Some(&Value::Bool(b)) => b, - _ => { - return Err(ArrowError::ParseError( - "Field missing 'nullable' attribute".to_string(), - )); - } - }; - let data_type = match map.get("type") { - Some(t) => DataType::from(t)?, - _ => { - return Err(ArrowError::ParseError( - "Field missing 'type' attribute".to_string(), - )); - } - }; - - // Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz - let metadata = match map.get("metadata") { - Some(&Value::Array(ref values)) => { - let mut res: BTreeMap = BTreeMap::new(); - for value in values { - match value.as_object() { - Some(map) => { - if map.len() != 2 { - return Err(ArrowError::ParseError( - "Field 'metadata' must have exact two entries for each key-value map".to_string(), - )); - } - if let (Some(k), Some(v)) = - (map.get("key"), map.get("value")) - { - if let (Some(k_str), Some(v_str)) = - (k.as_str(), v.as_str()) - { - res.insert( - k_str.to_string().clone(), - v_str.to_string().clone(), - ); - } else { - return Err(ArrowError::ParseError("Field 'metadata' must have map value of string type".to_string())); - } - } else { - return Err(ArrowError::ParseError("Field 'metadata' lacks map keys named \"key\" or \"value\"".to_string())); - } - } - _ => { - return Err(ArrowError::ParseError( - "Field 
'metadata' contains non-object key-value pair".to_string(), - )); - } - } - } - Some(res) - } - // We also support map format, because Schema's metadata supports this. - // See https://github.com/apache/arrow/pull/5907 - Some(&Value::Object(ref values)) => { - let mut res: BTreeMap = BTreeMap::new(); - for (k, v) in values { - if let Some(str_value) = v.as_str() { - res.insert(k.clone(), str_value.to_string().clone()); - } else { - return Err(ArrowError::ParseError( - format!("Field 'metadata' contains non-string value for key {}", k), - )); - } - } - Some(res) - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field `metadata` is not json array".to_string(), - )); - } - _ => None, - }; - - // if data_type is a struct or list, get its children - let data_type = match data_type { - DataType::List(_) - | DataType::LargeList(_) - | DataType::FixedSizeList(_, _) => match map.get("children") { - Some(Value::Array(values)) => { - if values.len() != 1 { - return Err(ArrowError::ParseError( - "Field 'children' must have one element for a list data type".to_string(), - )); - } - match data_type { - DataType::List(_) => { - DataType::List(Box::new(Self::from(&values[0])?)) - } - DataType::LargeList(_) => { - DataType::LargeList(Box::new(Self::from(&values[0])?)) - } - DataType::FixedSizeList(_, int) => DataType::FixedSizeList( - Box::new(Self::from(&values[0])?), - int, - ), - _ => unreachable!( - "Data type should be a list, largelist or fixedsizelist" - ), - } - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field 'children' must be an array".to_string(), - )) - } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); - } - }, - DataType::Struct(mut fields) => match map.get("children") { - Some(Value::Array(values)) => { - let struct_fields: Result, _> = - values.iter().map(Field::from).collect(); - fields.append(&mut struct_fields?); - DataType::Struct(fields) - } - Some(_) => { - return 
Err(ArrowError::ParseError( - "Field 'children' must be an array".to_string(), - )) - } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); - } - }, - DataType::Map(_, keys_sorted) => { - match map.get("children") { - Some(Value::Array(values)) if values.len() == 1 => { - let child = Self::from(&values[0])?; - // child must be a struct - match child.data_type() { - DataType::Struct(map_fields) if map_fields.len() == 2 => { - DataType::Map(Box::new(child), keys_sorted) - } - t => { - return Err(ArrowError::ParseError( - format!("Map children should be a struct with 2 fields, found {:?}", t) - )) - } - } - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field 'children' must be an array with 1 element" - .to_string(), - )) - } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); - } - } - } - DataType::Union(_, type_ids, mode) => match map.get("children") { - Some(Value::Array(values)) => { - let union_fields: Vec = values - .iter() - .map(Field::from) - .collect::>()?; - DataType::Union(union_fields, type_ids, mode) - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field 'children' must be an array".to_string(), - )) - } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); - } - }, - _ => data_type, - }; - - let mut dict_id = 0; - let mut dict_is_ordered = false; - - let data_type = match map.get("dictionary") { - Some(dictionary) => { - let index_type = match dictionary.get("indexType") { - Some(t) => DataType::from(t)?, - _ => { - return Err(ArrowError::ParseError( - "Field missing 'indexType' attribute".to_string(), - )); - } - }; - dict_id = match dictionary.get("id") { - Some(Value::Number(n)) => n.as_i64().unwrap(), - _ => { - return Err(ArrowError::ParseError( - "Field missing 'id' attribute".to_string(), - )); - } - }; - dict_is_ordered = match dictionary.get("isOrdered") { - 
Some(&Value::Bool(n)) => n, - _ => { - return Err(ArrowError::ParseError( - "Field missing 'isOrdered' attribute".to_string(), - )); - } - }; - DataType::Dictionary(Box::new(index_type), Box::new(data_type)) - } - _ => data_type, - }; - Ok(Field { - name, - data_type, - nullable, - dict_id, - dict_is_ordered, - metadata, - }) - } - _ => Err(ArrowError::ParseError( - "Invalid json value type for field".to_string(), - )), - } - } - - /// Generate a JSON representation of the `Field`. - #[cfg(feature = "json")] - pub fn to_json(&self) -> serde_json::Value { - let children: Vec = match self.data_type() { - DataType::Struct(fields) => fields.iter().map(|f| f.to_json()).collect(), - DataType::List(field) - | DataType::LargeList(field) - | DataType::FixedSizeList(field, _) - | DataType::Map(field, _) => vec![field.to_json()], - _ => vec![], - }; - match self.data_type() { - DataType::Dictionary(ref index_type, ref value_type) => serde_json::json!({ - "name": self.name, - "nullable": self.nullable, - "type": value_type.to_json(), - "children": children, - "dictionary": { - "id": self.dict_id, - "indexType": index_type.to_json(), - "isOrdered": self.dict_is_ordered - } - }), - _ => serde_json::json!({ - "name": self.name, - "nullable": self.nullable, - "type": self.data_type.to_json(), - "children": children - }), - } - } - /// Merge this field into self if it is compatible. /// /// Struct fields are merged recursively. diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 8bfc26d5f318..ac69d6e8fdea 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -234,82 +234,6 @@ impl Schema { .find(|&(_, c)| c.name() == name) } - /// Generate a JSON representation of the `Schema`. 
- #[cfg(feature = "json")] - pub fn to_json(&self) -> serde_json::Value { - serde_json::json!({ - "fields": self.fields.iter().map(|field| field.to_json()).collect::>(), - "metadata": serde_json::to_value(&self.metadata).unwrap() - }) - } - - /// Parse a `Schema` definition from a JSON representation. - #[cfg(feature = "json")] - pub fn from(json: &serde_json::Value) -> Result { - use serde_json::Value; - match *json { - Value::Object(ref schema) => { - let fields = if let Some(Value::Array(fields)) = schema.get("fields") { - fields.iter().map(Field::from).collect::>()? - } else { - return Err(ArrowError::ParseError( - "Schema fields should be an array".to_string(), - )); - }; - - let metadata = if let Some(value) = schema.get("metadata") { - Self::from_metadata(value)? - } else { - HashMap::default() - }; - - Ok(Self { fields, metadata }) - } - _ => Err(ArrowError::ParseError( - "Invalid json value type for schema".to_string(), - )), - } - } - - /// Parse a `metadata` definition from a JSON representation. - /// The JSON can either be an Object or an Array of Objects. - #[cfg(feature = "json")] - fn from_metadata( - json: &serde_json::Value, - ) -> Result, ArrowError> { - use serde_json::Value; - match json { - Value::Array(_) => { - let mut hashmap = HashMap::new(); - let values: Vec = serde_json::from_value(json.clone()) - .map_err(|_| { - ArrowError::ParseError( - "Unable to parse object into key-value pair".to_string(), - ) - })?; - for meta in values { - hashmap.insert(meta.key.clone(), meta.value); - } - Ok(hashmap) - } - Value::Object(md) => md - .iter() - .map(|(k, v)| { - if let Value::String(v) = v { - Ok((k.to_string(), v.to_string())) - } else { - Err(ArrowError::ParseError( - "metadata `value` field must be a string".to_string(), - )) - } - }) - .collect::>(), - _ => Err(ArrowError::ParseError( - "`metadata` field must be an object".to_string(), - )), - } - } - /// Check to see if `self` is a superset of `other` schema. 
Here are the comparison rules: /// /// * `self` and `other` should contain the same number of fields @@ -358,13 +282,6 @@ impl Hash for Schema { } } -#[cfg(feature = "json")] -#[derive(serde::Deserialize)] -struct MetadataKeyValue { - key: String, - value: String, -} - #[cfg(test)] mod tests { use super::*; diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index 6638ae1e87df..a344407e426d 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -32,7 +32,7 @@ use crate::buffer::Buffer; use crate::buffer::MutableBuffer; use crate::compute::kernels::arity::unary; use crate::compute::util::combine_option_bitmap; -use crate::compute::{binary, try_binary, try_unary, unary_dyn}; +use crate::compute::{binary, binary_opt, try_binary, try_unary, unary_dyn}; use crate::datatypes::{ native_op::ArrowNativeTypeOp, ArrowNumericType, DataType, Date32Type, Date64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, IntervalYearMonthType, @@ -711,7 +711,7 @@ where } /// Perform `left + right` operation on two arrays. If either left or right value is null -/// then the result is also null. Once +/// then the result is also null. /// /// This detects overflow and returns an `Err` for that. For an non-overflow-checking variant, /// use `add` instead. @@ -1118,6 +1118,32 @@ where return math_checked_divide_op(left, right, |a, b| a.div_checked(b)); } +/// Perform `left / right` operation on two arrays. If either left or right value is null +/// then the result is also null. +/// +/// If any right hand value is zero, the operation value will be replaced with null in the +/// result. +/// +/// Unlike `divide` or `divide_checked`, division by zero will get a null value instead of +/// returning an `Err`. This also doesn't check for overflow; overflow will just wrap +/// the result around. 
+pub fn divide_opt( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result> +where + T: ArrowNumericType, + T::Native: ArrowNativeTypeOp + Zero + One, +{ + Ok(binary_opt(left, right, |a, b| { + if b.is_zero() { + None + } else { + Some(a.div_wrapping(b)) + } + })) +} + /// Perform `left / right` operation on two arrays. If either left or right value is null /// then the result is also null. If any right hand value is zero then the result of this /// operation will be `Err(ArrowError::DivideByZero)`. @@ -1152,7 +1178,7 @@ pub fn divide( right: &PrimitiveArray, ) -> Result> where - T: datatypes::ArrowNumericType, + T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { math_op(left, right, |a, b| a.div_wrapping(b)) @@ -2195,4 +2221,23 @@ mod tests { let overflow = multiply_scalar_checked(&a, i32::MAX); overflow.expect_err("overflow should be detected"); } + + #[test] + fn test_primitive_div_opt_overflow_division_by_zero() { + let a = Int32Array::from(vec![i32::MIN]); + let b = Int32Array::from(vec![-1]); + + let wrapped = divide(&a, &b); + let expected = Int32Array::from(vec![-2147483648]); + assert_eq!(expected, wrapped.unwrap()); + + let overflow = divide_opt(&a, &b); + let expected = Int32Array::from(vec![-2147483648]); + assert_eq!(expected, overflow.unwrap()); + + let b = Int32Array::from(vec![0]); + let overflow = divide_opt(&a, &b); + let expected = Int32Array::from(vec![None]); + assert_eq!(expected, overflow.unwrap()); + } } diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index ee3ff5e23a83..fffa81af8190 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -18,7 +18,7 @@ //! Defines kernels suitable to perform operations to primitive arrays. 
use crate::array::{ - Array, ArrayData, ArrayRef, BufferBuilder, DictionaryArray, PrimitiveArray, + Array, ArrayData, ArrayIter, ArrayRef, BufferBuilder, DictionaryArray, PrimitiveArray, }; use crate::buffer::Buffer; use crate::compute::util::combine_option_bitmap; @@ -257,6 +257,59 @@ where Ok(unsafe { build_primitive_array(len, buffer.finish(), null_count, null_buffer) }) } +/// Applies the provided binary operation across `a` and `b`, collecting the optional results +/// into a [`PrimitiveArray`]. If any index is null in either `a` or `b`, the corresponding +/// index in the result will also be null. The binary operation could return `None` which +/// results in a new null in the collected [`PrimitiveArray`]. +/// +/// The function is only evaluated for non-null indices +/// +/// # Panic +/// +/// Panics if the arrays have different lengths +pub(crate) fn binary_opt( + a: &PrimitiveArray, + b: &PrimitiveArray, + op: F, +) -> PrimitiveArray +where + A: ArrowPrimitiveType, + B: ArrowPrimitiveType, + O: ArrowPrimitiveType, + F: Fn(A::Native, B::Native) -> Option, +{ + assert_eq!(a.len(), b.len()); + + if a.is_empty() { + return PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE)); + } + + if a.null_count() == 0 && b.null_count() == 0 { + a.values() + .iter() + .zip(b.values().iter()) + .map(|(a, b)| op(*a, *b)) + .collect() + } else { + let iter_a = ArrayIter::new(a); + let iter_b = ArrayIter::new(b); + + let values = + iter_a + .into_iter() + .zip(iter_b.into_iter()) + .map(|(item_a, item_b)| { + if let (Some(a), Some(b)) = (item_a, item_b) { + op(a, b) + } else { + None + } + }); + + values.collect() + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow/src/compute/kernels/bitwise.rs b/arrow/src/compute/kernels/bitwise.rs index 18b9f4bb760c..2f3c9e490f4c 100644 --- a/arrow/src/compute/kernels/bitwise.rs +++ b/arrow/src/compute/kernels/bitwise.rs @@ -19,7 +19,7 @@ use crate::array::PrimitiveArray; use crate::compute::{binary, unary}; use 
crate::datatypes::ArrowNumericType; use crate::error::{ArrowError, Result}; -use std::ops::BitAnd; +use std::ops::{BitAnd, BitOr, BitXor, Not}; // The helper function for bitwise operation with two array fn bitwise_op( @@ -52,7 +52,43 @@ where bitwise_op(left, right, |a, b| a & b) } -/// Perform bitwise and every value in an array with the scalar. If any value in the array is null then the +/// Perform `left | right` operation on two arrays. If either left or right value is null +/// then the result is also null. +pub fn bitwise_or( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result> +where + T: ArrowNumericType, + T::Native: BitOr, +{ + bitwise_op(left, right, |a, b| a | b) +} + +/// Perform `left ^ right` operation on two arrays. If either left or right value is null +/// then the result is also null. +pub fn bitwise_xor( + left: &PrimitiveArray, + right: &PrimitiveArray, +) -> Result> +where + T: ArrowNumericType, + T::Native: BitXor, +{ + bitwise_op(left, right, |a, b| a ^ b) +} + +/// Perform `!array` operation on array. If array value is null +/// then the result is also null. +pub fn bitwise_not(array: &PrimitiveArray) -> Result> +where + T: ArrowNumericType, + T::Native: Not, +{ + Ok(unary(array, |value| !value)) +} + +/// Perform bitwise `and` every value in an array with the scalar. If any value in the array is null then the /// result is also null. pub fn bitwise_and_scalar( array: &PrimitiveArray, @@ -65,10 +101,39 @@ where Ok(unary(array, |value| value & scalar)) } +/// Perform bitwise `or` every value in an array with the scalar. If any value in the array is null then the +/// result is also null. +pub fn bitwise_or_scalar( + array: &PrimitiveArray, + scalar: T::Native, +) -> Result> +where + T: ArrowNumericType, + T::Native: BitOr, +{ + Ok(unary(array, |value| value | scalar)) +} + +/// Perform bitwise `xor` every value in an array with the scalar. If any value in the array is null then the +/// result is also null. 
+pub fn bitwise_xor_scalar( + array: &PrimitiveArray, + scalar: T::Native, +) -> Result> +where + T: ArrowNumericType, + T::Native: BitXor, +{ + Ok(unary(array, |value| value ^ scalar)) +} + #[cfg(test)] mod tests { use crate::array::{Int32Array, UInt64Array}; - use crate::compute::kernels::bitwise::{bitwise_and, bitwise_and_scalar}; + use crate::compute::kernels::bitwise::{ + bitwise_and, bitwise_and_scalar, bitwise_not, bitwise_or, bitwise_or_scalar, + bitwise_xor, bitwise_xor_scalar, + }; use crate::error::Result; #[test] @@ -82,7 +147,7 @@ mod tests { // signed value let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); - let right = Int32Array::from(vec![Some(5), Some(10), Some(8), Some(12)]); + let right = Int32Array::from(vec![Some(5), Some(-10), Some(8), Some(12)]); let expected = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); let result = bitwise_and(&left, &right)?; assert_eq!(expected, result); @@ -100,10 +165,102 @@ mod tests { // signed value let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); - let scalar = 20; + let scalar = -20; let expected = Int32Array::from(vec![Some(0), Some(0), None, Some(4)]); let result = bitwise_and_scalar(&left, scalar)?; assert_eq!(expected, result); Ok(()) } + + #[test] + fn test_bitwise_or_array() -> Result<()> { + // unsigned value + let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]); + let right = UInt64Array::from(vec![Some(7), Some(5), Some(8), Some(13)]); + let expected = UInt64Array::from(vec![Some(7), Some(7), None, Some(13)]); + let result = bitwise_or(&left, &right)?; + assert_eq!(expected, result); + + // signed value + let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); + let right = Int32Array::from(vec![Some(-7), Some(-5), Some(8), Some(13)]); + let expected = Int32Array::from(vec![Some(-7), Some(-5), None, Some(13)]); + let result = bitwise_or(&left, &right)?; + assert_eq!(expected, result); + Ok(()) + } + + #[test] + fn 
test_bitwise_not_array() -> Result<()> { + // unsigned value + let array = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]); + let expected = UInt64Array::from(vec![ + Some(18446744073709551614), + Some(18446744073709551613), + None, + Some(18446744073709551611), + ]); + let result = bitwise_not(&array)?; + assert_eq!(expected, result); + // signed value + let array = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); + let expected = Int32Array::from(vec![Some(-2), Some(-3), None, Some(-5)]); + let result = bitwise_not(&array)?; + assert_eq!(expected, result); + Ok(()) + } + + #[test] + fn test_bitwise_or_array_scalar() -> Result<()> { + // unsigned value + let left = UInt64Array::from(vec![Some(15), Some(2), None, Some(4)]); + let scalar = 7; + let expected = UInt64Array::from(vec![Some(15), Some(7), None, Some(7)]); + let result = bitwise_or_scalar(&left, scalar)?; + assert_eq!(expected, result); + + // signed value + let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); + let scalar = 20; + let expected = Int32Array::from(vec![Some(21), Some(22), None, Some(20)]); + let result = bitwise_or_scalar(&left, scalar)?; + assert_eq!(expected, result); + Ok(()) + } + + #[test] + fn test_bitwise_xor_array() -> Result<()> { + // unsigned value + let left = UInt64Array::from(vec![Some(1), Some(2), None, Some(4)]); + let right = UInt64Array::from(vec![Some(7), Some(5), Some(8), Some(13)]); + let expected = UInt64Array::from(vec![Some(6), Some(7), None, Some(9)]); + let result = bitwise_xor(&left, &right)?; + assert_eq!(expected, result); + + // signed value + let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); + let right = Int32Array::from(vec![Some(-7), Some(5), Some(8), Some(-13)]); + let expected = Int32Array::from(vec![Some(-8), Some(7), None, Some(-9)]); + let result = bitwise_xor(&left, &right)?; + assert_eq!(expected, result); + Ok(()) + } + + #[test] + fn test_bitwise_xor_array_scalar() -> Result<()> { + // unsigned 
value + let left = UInt64Array::from(vec![Some(15), Some(2), None, Some(4)]); + let scalar = 7; + let expected = UInt64Array::from(vec![Some(8), Some(5), None, Some(3)]); + let result = bitwise_xor_scalar(&left, scalar)?; + assert_eq!(expected, result); + + // signed value + let left = Int32Array::from(vec![Some(1), Some(2), None, Some(4)]); + let scalar = -20; + let expected = Int32Array::from(vec![Some(-19), Some(-18), None, Some(-24)]); + let result = bitwise_xor_scalar(&left, scalar)?; + assert_eq!(expected, result); + Ok(()) + } } diff --git a/arrow/src/datatypes/decimal.rs b/arrow/src/datatypes/decimal.rs index a7f3f3c1dc10..2ca71ef77725 100644 --- a/arrow/src/datatypes/decimal.rs +++ b/arrow/src/datatypes/decimal.rs @@ -15,10 +15,256 @@ // specific language governing permissions and limitations // under the License. -use crate::error::{ArrowError, Result}; -use crate::util::decimal::singed_cmp_le_bytes; use num::BigInt; use std::cmp::Ordering; +use std::fmt; + +use crate::error::{ArrowError, Result}; +use crate::util::decimal::singed_cmp_le_bytes; + +use super::Field; + +/// The set of datatypes that are supported by this implementation of Apache Arrow. +/// +/// The Arrow specification on data types includes some more types. +/// See also [`Schema.fbs`](https://github.com/apache/arrow/blob/master/format/Schema.fbs) +/// for Arrow's specification. +/// +/// The variants of this enum include primitive fixed size types as well as parametric or +/// nested types. +/// Currently the Rust implementation supports the following nested types: +/// - `List` +/// - `Struct` +/// +/// Nested types can themselves be nested within other arrays. +/// For more information on these types please see +/// [the physical memory layout of Apache Arrow](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout). 
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub enum DataType { + /// Null type + Null, + /// A boolean datatype representing the values `true` and `false`. + Boolean, + /// A signed 8-bit integer. + Int8, + /// A signed 16-bit integer. + Int16, + /// A signed 32-bit integer. + Int32, + /// A signed 64-bit integer. + Int64, + /// An unsigned 8-bit integer. + UInt8, + /// An unsigned 16-bit integer. + UInt16, + /// An unsigned 32-bit integer. + UInt32, + /// An unsigned 64-bit integer. + UInt64, + /// A 16-bit floating point number. + Float16, + /// A 32-bit floating point number. + Float32, + /// A 64-bit floating point number. + Float64, + /// A timestamp with an optional timezone. + /// + /// Time is measured as a Unix epoch, counting the seconds from + /// 00:00:00.000 on 1 January 1970, excluding leap seconds, + /// as a 64-bit integer. + /// + /// The time zone is a string indicating the name of a time zone, one of: + /// + /// * As used in the Olson time zone database (the "tz database" or + /// "tzdata"), such as "America/New_York" + /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + /// + /// Timestamps with a non-empty timezone + /// ------------------------------------ + /// + /// If a Timestamp column has a non-empty timezone value, its epoch is + /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone + /// (the Unix epoch), regardless of the Timestamp's own timezone. + /// + /// Therefore, timestamp values with a non-empty timezone correspond to + /// physical points in time together with some additional information about + /// how the data was obtained and/or how to display it (the timezone). 
+ /// + /// For example, the timestamp value 0 with the timezone string "Europe/Paris" + /// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the + /// application may prefer to display it as "January 1st 1970, 01h00" in + /// the Europe/Paris timezone (which is the same physical point in time). + /// + /// One consequence is that timestamp values with a non-empty timezone + /// can be compared and ordered directly, since they all share the same + /// well-known point of reference (the Unix epoch). + /// + /// Timestamps with an unset / empty timezone + /// ----------------------------------------- + /// + /// If a Timestamp column has no timezone value, its epoch is + /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone. + /// + /// Therefore, timestamp values without a timezone cannot be meaningfully + /// interpreted as physical points in time, but only as calendar / clock + /// indications ("wall clock time") in an unspecified timezone. + /// + /// For example, the timestamp value 0 with an empty timezone string + /// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there + /// is not enough information to interpret it as a well-defined physical + /// point in time. + /// + /// One consequence is that timestamp values without a timezone cannot + /// be reliably compared or ordered, since they may have different points of + /// reference. In particular, it is *not* possible to interpret an unset + /// or empty timezone as the same as "UTC". + /// + /// Conversion between timezones + /// ---------------------------- + /// + /// If a Timestamp column has a non-empty timezone, changing the timezone + /// to a different non-empty value is a metadata-only operation: + /// the timestamp values need not change as their point of reference remains + /// the same (the Unix epoch). 
+ /// + /// However, if a Timestamp column has no timezone value, changing it to a + /// non-empty value requires thinking about the desired semantics. + /// One possibility is to assume that the original timestamp values are + /// relative to the epoch of the timezone being set; timestamp values should + /// then be adjusted to the Unix epoch (for example, changing the timezone from + /// empty to "Europe/Paris" would require converting the timestamp values + /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is + /// nevertheless correct). + Timestamp(TimeUnit, Option), + /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) + /// in days (32 bits). + Date32, + /// A 64-bit date representing the elapsed time since UNIX epoch (1970-01-01) + /// in milliseconds (64 bits). Values are evenly divisible by 86400000. + Date64, + /// A 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. + Time32(TimeUnit), + /// A 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. + Time64(TimeUnit), + /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. + Duration(TimeUnit), + /// A "calendar" interval which models types that don't necessarily + /// have a precise duration without the context of a base timestamp (e.g. + /// days can differ in length during day light savings time transitions). + Interval(IntervalUnit), + /// Opaque binary data of variable length. + Binary, + /// Opaque binary data of fixed size. + /// Enum parameter specifies the number of bytes per value. + FixedSizeBinary(i32), + /// Opaque binary data of variable length and 64-bit offsets. + LargeBinary, + /// A variable-length string in Unicode with UTF-8 encoding. + Utf8, + /// A variable-length string in Unicode with UTF-8 encoding and 64-bit offsets. + LargeUtf8, + /// A list of some logical data type with variable length. 
+ List(Box), + /// A list of some logical data type with fixed length. + FixedSizeList(Box, i32), + /// A list of some logical data type with variable length and 64-bit offsets. + LargeList(Box), + /// A nested datatype that contains a number of sub-fields. + Struct(Vec), + /// A nested datatype that can represent slots of differing types. Components: + /// + /// 1. [`Field`] for each possible child type the Union can hold + /// 2. The corresponding `type_id` used to identify which Field + /// 3. The type of union (Sparse or Dense) + Union(Vec, Vec, UnionMode), + /// A dictionary encoded array (`key_type`, `value_type`), where + /// each array element is an index of `key_type` into an + /// associated dictionary of `value_type`. + /// + /// Dictionary arrays are used to store columns of `value_type` + /// that contain many repeated values using less memory, but with + /// a higher CPU overhead for some operations. + /// + /// This type mostly used to represent low cardinality string + /// arrays or a limited set of primitive types as integers. + Dictionary(Box, Box), + /// Exact 128-bit width decimal value with precision and scale + /// + /// * precision is the total number of digits + /// * scale is the number of digits past the decimal + /// + /// For example the number 123.45 has precision 5 and scale 2. + Decimal128(u8, u8), + /// Exact 256-bit width decimal value with precision and scale + /// + /// * precision is the total number of digits + /// * scale is the number of digits past the decimal + /// + /// For example the number 123.45 has precision 5 and scale 2. + Decimal256(u8, u8), + /// A Map is a logical nested type that is represented as + /// + /// `List>` + /// + /// The keys and values are each respectively contiguous. + /// The key and value types are not constrained, but keys should be + /// hashable and unique. + /// Whether the keys are sorted can be set in the `bool` after the `Field`. 
+ /// + /// In a field with Map type, the field has a child Struct field, which then + /// has two children: key type and the second the value type. The names of the + /// child fields may be respectively "entries", "key", and "value", but this is + /// not enforced. + Map(Box, bool), +} + +/// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub enum TimeUnit { + /// Time in seconds. + Second, + /// Time in milliseconds. + Millisecond, + /// Time in microseconds. + Microsecond, + /// Time in nanoseconds. + Nanosecond, +} + +/// YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO interval in SQL style. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub enum IntervalUnit { + /// Indicates the number of elapsed whole months, stored as 4-byte integers. + YearMonth, + /// Indicates the number of elapsed days and milliseconds, + /// stored as 2 contiguous 32-bit integers (days, milliseconds) (8-bytes in total). + DayTime, + /// A triple of the number of elapsed months, days, and nanoseconds. + /// The values are stored contiguously in 16 byte blocks. Months and + /// days are encoded as 32 bit integers and nanoseconds is encoded as a + /// 64 bit integer. All integers are signed. Each field is independent + /// (e.g. there is no constraint that nanoseconds have the same sign + /// as days or that the quantity of nanoseconds represents less + /// than a day's worth of time). 
+ MonthDayNano, +} + +// Sparse or Dense union layouts +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub enum UnionMode { + Sparse, + Dense, +} + +impl fmt::Display for DataType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self) + } +} // MAX decimal256 value of little-endian format for each precision. // Each element is the max value of signed 256-bit integer for the specified precision which @@ -641,7 +887,7 @@ pub(crate) const MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [[u8; 32]; 76] = [ ]; /// `MAX_DECIMAL_FOR_EACH_PRECISION[p]` holds the maximum `i128` value -/// that can be stored in `DataType::Decimal128` value of precision `p` +/// that can be stored in [DataType::Decimal128] value of precision `p` pub const MAX_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ 9, 99, @@ -684,7 +930,7 @@ pub const MAX_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ ]; /// `MIN_DECIMAL_FOR_EACH_PRECISION[p]` holds the minimum `i128` value -/// that can be stored in a `DataType::Decimal128` value of precision `p` +/// that can be stored in a [DataType::Decimal128] value of precision `p` pub const MIN_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ -9, -99, @@ -726,19 +972,19 @@ pub const MIN_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ -99999999999999999999999999999999999999, ]; -/// The maximum precision for `DataType::Decimal128` values +/// The maximum precision for [DataType::Decimal128] values pub const DECIMAL128_MAX_PRECISION: u8 = 38; -/// The maximum scale for `DataType::Decimal128` values +/// The maximum scale for [DataType::Decimal128] values pub const DECIMAL128_MAX_SCALE: u8 = 38; -/// The maximum precision for `DataType::Decimal256` values +/// The maximum precision for [DataType::Decimal256] values pub const DECIMAL256_MAX_PRECISION: u8 = 76; -/// The maximum scale for `DataType::Decimal256` values +/// The maximum scale for [DataType::Decimal256] 
values pub const DECIMAL256_MAX_SCALE: u8 = 76; -/// The default scale for `DataType::Decimal128` and `DataType::Decimal256` values +/// The default scale for [DataType::Decimal128] and [DataType::Decimal256] values pub const DECIMAL_DEFAULT_SCALE: u8 = 10; /// Validates that the specified `i128` value can be properly @@ -805,9 +1051,100 @@ pub(crate) fn validate_decimal256_precision_with_lt_bytes( } } +impl DataType { + /// Returns true if this type is numeric: (UInt*, Int*, or Float*). + pub fn is_numeric(t: &DataType) -> bool { + use DataType::*; + matches!( + t, + UInt8 + | UInt16 + | UInt32 + | UInt64 + | Int8 + | Int16 + | Int32 + | Int64 + | Float32 + | Float64 + ) + } + + /// Returns true if this type is temporal: (Date*, Time*, Duration, or Interval). + pub fn is_temporal(t: &DataType) -> bool { + use DataType::*; + matches!( + t, + Date32 + | Date64 + | Timestamp(_, _) + | Time32(_) + | Time64(_) + | Duration(_) + | Interval(_) + ) + } + + /// Returns true if this type is valid as a dictionary key + /// (e.g. [`super::ArrowDictionaryKeyType`] + pub fn is_dictionary_key_type(t: &DataType) -> bool { + use DataType::*; + matches!( + t, + UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 + ) + } + + /// Returns true if this type is nested (List, FixedSizeList, LargeList, Struct, Union, or Map) + pub fn is_nested(t: &DataType) -> bool { + use DataType::*; + matches!( + t, + List(_) + | FixedSizeList(_, _) + | LargeList(_) + | Struct(_) + | Union(_, _, _) + | Map(_, _) + ) + } + + /// Compares the datatype with another, ignoring nested field names + /// and metadata. 
+ pub fn equals_datatype(&self, other: &DataType) -> bool { + match (&self, other) { + (DataType::List(a), DataType::List(b)) + | (DataType::LargeList(a), DataType::LargeList(b)) => { + a.is_nullable() == b.is_nullable() + && a.data_type().equals_datatype(b.data_type()) + } + (DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => { + a_size == b_size + && a.is_nullable() == b.is_nullable() + && a.data_type().equals_datatype(b.data_type()) + } + (DataType::Struct(a), DataType::Struct(b)) => { + a.len() == b.len() + && a.iter().zip(b).all(|(a, b)| { + a.is_nullable() == b.is_nullable() + && a.data_type().equals_datatype(b.data_type()) + }) + } + ( + DataType::Map(a_field, a_is_sorted), + DataType::Map(b_field, b_is_sorted), + ) => a_field == b_field && a_is_sorted == b_is_sorted, + _ => self == other, + } + } +} + #[cfg(test)] mod test { - use super::*; + use crate::datatypes::datatype::{ + MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION, + MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION, + }; use crate::util::decimal::Decimal256; use num::{BigInt, Num}; diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index 693fb40feae6..090907fd37dd 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -55,7 +55,7 @@ mod tests { #[cfg(feature = "json")] use serde_json::{ - Number, Value, + Number, Value::{Bool, Number as VNumber, String as VString}, }; @@ -172,942 +172,6 @@ mod tests { assert_eq!(person, deserialized); } - #[test] - #[cfg(feature = "json")] - fn struct_field_to_json() { - let f = Field::new( - "address", - DataType::Struct(vec![ - Field::new("street", DataType::Utf8, false), - Field::new("zip", DataType::UInt16, false), - ]), - false, - ); - let value: Value = serde_json::from_str( - r#"{ - "name": "address", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [ - { - "name": "street", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "zip", - 
"nullable": false, - "type": { - "name": "int", - "bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - }"#, - ) - .unwrap(); - assert_eq!(value, f.to_json()); - } - - #[test] - #[cfg(feature = "json")] - fn map_field_to_json() { - let f = Field::new( - "my_map", - DataType::Map( - Box::new(Field::new( - "my_entries", - DataType::Struct(vec![ - Field::new("my_keys", DataType::Utf8, false), - Field::new("my_values", DataType::UInt16, true), - ]), - false, - )), - true, - ), - false, - ); - let value: Value = serde_json::from_str( - r#"{ - "name": "my_map", - "nullable": false, - "type": { - "name": "map", - "keysSorted": true - }, - "children": [ - { - "name": "my_entries", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [ - { - "name": "my_keys", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "my_values", - "nullable": true, - "type": { - "name": "int", - "bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - } - ] - }"#, - ) - .unwrap(); - assert_eq!(value, f.to_json()); - } - - #[test] - #[cfg(feature = "json")] - fn primitive_field_to_json() { - let f = Field::new("first_name", DataType::Utf8, false); - let value: Value = serde_json::from_str( - r#"{ - "name": "first_name", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }"#, - ) - .unwrap(); - assert_eq!(value, f.to_json()); - } - #[test] - #[cfg(feature = "json")] - fn parse_struct_from_json() { - let json = r#" - { - "name": "address", - "type": { - "name": "struct" - }, - "nullable": false, - "children": [ - { - "name": "street", - "type": { - "name": "utf8" - }, - "nullable": false, - "children": [] - }, - { - "name": "zip", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 16 - }, - "nullable": false, - "children": [] - } - ] - } - "#; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = Field::from(&value).unwrap(); - - let expected = Field::new( - 
"address", - DataType::Struct(vec![ - Field::new("street", DataType::Utf8, false), - Field::new("zip", DataType::UInt16, false), - ]), - false, - ); - - assert_eq!(expected, dt); - } - - #[test] - #[cfg(feature = "json")] - fn parse_map_from_json() { - let json = r#" - { - "name": "my_map", - "nullable": false, - "type": { - "name": "map", - "keysSorted": true - }, - "children": [ - { - "name": "my_entries", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [ - { - "name": "my_keys", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "my_values", - "nullable": true, - "type": { - "name": "int", - "bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - } - ] - } - "#; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = Field::from(&value).unwrap(); - - let expected = Field::new( - "my_map", - DataType::Map( - Box::new(Field::new( - "my_entries", - DataType::Struct(vec![ - Field::new("my_keys", DataType::Utf8, false), - Field::new("my_values", DataType::UInt16, true), - ]), - false, - )), - true, - ), - false, - ); - - assert_eq!(expected, dt); - } - - #[test] - #[cfg(feature = "json")] - fn parse_union_from_json() { - let json = r#" - { - "name": "my_union", - "nullable": false, - "type": { - "name": "union", - "mode": "SPARSE", - "typeIds": [ - 5, - 7 - ] - }, - "children": [ - { - "name": "f1", - "type": { - "name": "int", - "isSigned": true, - "bitWidth": 32 - }, - "nullable": true, - "children": [] - }, - { - "name": "f2", - "type": { - "name": "utf8" - }, - "nullable": true, - "children": [] - } - ] - } - "#; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = Field::from(&value).unwrap(); - - let expected = Field::new( - "my_union", - DataType::Union( - vec![ - Field::new("f1", DataType::Int32, true), - Field::new("f2", DataType::Utf8, true), - ], - vec![5, 7], - UnionMode::Sparse, - ), - false, - ); - - assert_eq!(expected, dt); - } - - #[test] - 
#[cfg(feature = "json")] - fn parse_utf8_from_json() { - let json = "{\"name\":\"utf8\"}"; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = DataType::from(&value).unwrap(); - assert_eq!(DataType::Utf8, dt); - } - - #[test] - #[cfg(feature = "json")] - fn parse_int32_from_json() { - let json = "{\"name\": \"int\", \"isSigned\": true, \"bitWidth\": 32}"; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = DataType::from(&value).unwrap(); - assert_eq!(DataType::Int32, dt); - } - - #[test] - #[cfg(feature = "json")] - fn schema_json() { - // Add some custom metadata - let metadata: HashMap = - [("Key".to_string(), "Value".to_string())] - .iter() - .cloned() - .collect(); - - let schema = Schema::new_with_metadata( - vec![ - Field::new("c1", DataType::Utf8, false), - Field::new("c2", DataType::Binary, false), - Field::new("c3", DataType::FixedSizeBinary(3), false), - Field::new("c4", DataType::Boolean, false), - Field::new("c5", DataType::Date32, false), - Field::new("c6", DataType::Date64, false), - Field::new("c7", DataType::Time32(TimeUnit::Second), false), - Field::new("c8", DataType::Time32(TimeUnit::Millisecond), false), - Field::new("c9", DataType::Time32(TimeUnit::Microsecond), false), - Field::new("c10", DataType::Time32(TimeUnit::Nanosecond), false), - Field::new("c11", DataType::Time64(TimeUnit::Second), false), - Field::new("c12", DataType::Time64(TimeUnit::Millisecond), false), - Field::new("c13", DataType::Time64(TimeUnit::Microsecond), false), - Field::new("c14", DataType::Time64(TimeUnit::Nanosecond), false), - Field::new("c15", DataType::Timestamp(TimeUnit::Second, None), false), - Field::new( - "c16", - DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".to_string())), - false, - ), - Field::new( - "c17", - DataType::Timestamp( - TimeUnit::Microsecond, - Some("Africa/Johannesburg".to_string()), - ), - false, - ), - Field::new( - "c18", - DataType::Timestamp(TimeUnit::Nanosecond, None), - false, - ), - 
Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false), - Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), - Field::new("c21", DataType::Interval(IntervalUnit::MonthDayNano), false), - Field::new( - "c22", - DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), - false, - ), - Field::new( - "c23", - DataType::FixedSizeList( - Box::new(Field::new("bools", DataType::Boolean, false)), - 5, - ), - false, - ), - Field::new( - "c24", - DataType::List(Box::new(Field::new( - "inner_list", - DataType::List(Box::new(Field::new( - "struct", - DataType::Struct(vec![]), - true, - ))), - false, - ))), - true, - ), - Field::new( - "c25", - DataType::Struct(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::UInt16, false), - ]), - false, - ), - Field::new("c26", DataType::Interval(IntervalUnit::YearMonth), true), - Field::new("c27", DataType::Interval(IntervalUnit::DayTime), true), - Field::new("c28", DataType::Interval(IntervalUnit::MonthDayNano), true), - Field::new("c29", DataType::Duration(TimeUnit::Second), false), - Field::new("c30", DataType::Duration(TimeUnit::Millisecond), false), - Field::new("c31", DataType::Duration(TimeUnit::Microsecond), false), - Field::new("c32", DataType::Duration(TimeUnit::Nanosecond), false), - Field::new_dict( - "c33", - DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::Utf8), - ), - true, - 123, - true, - ), - Field::new("c34", DataType::LargeBinary, true), - Field::new("c35", DataType::LargeUtf8, true), - Field::new( - "c36", - DataType::LargeList(Box::new(Field::new( - "inner_large_list", - DataType::LargeList(Box::new(Field::new( - "struct", - DataType::Struct(vec![]), - false, - ))), - true, - ))), - true, - ), - Field::new( - "c37", - DataType::Map( - Box::new(Field::new( - "my_entries", - DataType::Struct(vec![ - Field::new("my_keys", DataType::Utf8, false), - Field::new("my_values", DataType::UInt16, true), - ]), - false, - )), - true, - 
), - false, - ), - ], - metadata, - ); - - let expected = schema.to_json(); - let json = r#"{ - "fields": [ - { - "name": "c1", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "c2", - "nullable": false, - "type": { - "name": "binary" - }, - "children": [] - }, - { - "name": "c3", - "nullable": false, - "type": { - "name": "fixedsizebinary", - "byteWidth": 3 - }, - "children": [] - }, - { - "name": "c4", - "nullable": false, - "type": { - "name": "bool" - }, - "children": [] - }, - { - "name": "c5", - "nullable": false, - "type": { - "name": "date", - "unit": "DAY" - }, - "children": [] - }, - { - "name": "c6", - "nullable": false, - "type": { - "name": "date", - "unit": "MILLISECOND" - }, - "children": [] - }, - { - "name": "c7", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 32, - "unit": "SECOND" - }, - "children": [] - }, - { - "name": "c8", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 32, - "unit": "MILLISECOND" - }, - "children": [] - }, - { - "name": "c9", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 32, - "unit": "MICROSECOND" - }, - "children": [] - }, - { - "name": "c10", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 32, - "unit": "NANOSECOND" - }, - "children": [] - }, - { - "name": "c11", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 64, - "unit": "SECOND" - }, - "children": [] - }, - { - "name": "c12", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 64, - "unit": "MILLISECOND" - }, - "children": [] - }, - { - "name": "c13", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 64, - "unit": "MICROSECOND" - }, - "children": [] - }, - { - "name": "c14", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 64, - "unit": "NANOSECOND" - }, - "children": [] - }, - { - "name": "c15", - "nullable": false, - "type": { - "name": "timestamp", - "unit": "SECOND" - }, - "children": [] - }, 
- { - "name": "c16", - "nullable": false, - "type": { - "name": "timestamp", - "unit": "MILLISECOND", - "timezone": "UTC" - }, - "children": [] - }, - { - "name": "c17", - "nullable": false, - "type": { - "name": "timestamp", - "unit": "MICROSECOND", - "timezone": "Africa/Johannesburg" - }, - "children": [] - }, - { - "name": "c18", - "nullable": false, - "type": { - "name": "timestamp", - "unit": "NANOSECOND" - }, - "children": [] - }, - { - "name": "c19", - "nullable": false, - "type": { - "name": "interval", - "unit": "DAY_TIME" - }, - "children": [] - }, - { - "name": "c20", - "nullable": false, - "type": { - "name": "interval", - "unit": "YEAR_MONTH" - }, - "children": [] - }, - { - "name": "c21", - "nullable": false, - "type": { - "name": "interval", - "unit": "MONTH_DAY_NANO" - }, - "children": [] - }, - { - "name": "c22", - "nullable": false, - "type": { - "name": "list" - }, - "children": [ - { - "name": "item", - "nullable": true, - "type": { - "name": "bool" - }, - "children": [] - } - ] - }, - { - "name": "c23", - "nullable": false, - "type": { - "name": "fixedsizelist", - "listSize": 5 - }, - "children": [ - { - "name": "bools", - "nullable": false, - "type": { - "name": "bool" - }, - "children": [] - } - ] - }, - { - "name": "c24", - "nullable": true, - "type": { - "name": "list" - }, - "children": [ - { - "name": "inner_list", - "nullable": false, - "type": { - "name": "list" - }, - "children": [ - { - "name": "struct", - "nullable": true, - "type": { - "name": "struct" - }, - "children": [] - } - ] - } - ] - }, - { - "name": "c25", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [ - { - "name": "a", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "b", - "nullable": false, - "type": { - "name": "int", - "bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - }, - { - "name": "c26", - "nullable": true, - "type": { - "name": "interval", - "unit": "YEAR_MONTH" - }, - 
"children": [] - }, - { - "name": "c27", - "nullable": true, - "type": { - "name": "interval", - "unit": "DAY_TIME" - }, - "children": [] - }, - { - "name": "c28", - "nullable": true, - "type": { - "name": "interval", - "unit": "MONTH_DAY_NANO" - }, - "children": [] - }, - { - "name": "c29", - "nullable": false, - "type": { - "name": "duration", - "unit": "SECOND" - }, - "children": [] - }, - { - "name": "c30", - "nullable": false, - "type": { - "name": "duration", - "unit": "MILLISECOND" - }, - "children": [] - }, - { - "name": "c31", - "nullable": false, - "type": { - "name": "duration", - "unit": "MICROSECOND" - }, - "children": [] - }, - { - "name": "c32", - "nullable": false, - "type": { - "name": "duration", - "unit": "NANOSECOND" - }, - "children": [] - }, - { - "name": "c33", - "nullable": true, - "children": [], - "type": { - "name": "utf8" - }, - "dictionary": { - "id": 123, - "indexType": { - "name": "int", - "bitWidth": 32, - "isSigned": true - }, - "isOrdered": true - } - }, - { - "name": "c34", - "nullable": true, - "type": { - "name": "largebinary" - }, - "children": [] - }, - { - "name": "c35", - "nullable": true, - "type": { - "name": "largeutf8" - }, - "children": [] - }, - { - "name": "c36", - "nullable": true, - "type": { - "name": "largelist" - }, - "children": [ - { - "name": "inner_large_list", - "nullable": true, - "type": { - "name": "largelist" - }, - "children": [ - { - "name": "struct", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [] - } - ] - } - ] - }, - { - "name": "c37", - "nullable": false, - "type": { - "name": "map", - "keysSorted": true - }, - "children": [ - { - "name": "my_entries", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [ - { - "name": "my_keys", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "my_values", - "nullable": true, - "type": { - "name": "int", - "bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - } - 
] - } - ], - "metadata" : { - "Key": "Value" - } - }"#; - let value: Value = serde_json::from_str(json).unwrap(); - assert_eq!(expected, value); - - // convert back to a schema - let value: Value = serde_json::from_str(json).unwrap(); - let schema2 = Schema::from(&value).unwrap(); - - assert_eq!(schema, schema2); - - // Check that empty metadata produces empty value in JSON and can be parsed - let json = r#"{ - "fields": [ - { - "name": "c1", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - } - ], - "metadata": {} - }"#; - let value: Value = serde_json::from_str(json).unwrap(); - let schema = Schema::from(&value).unwrap(); - assert!(schema.metadata.is_empty()); - - // Check that metadata field is not required in the JSON. - let json = r#"{ - "fields": [ - { - "name": "c1", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - } - ] - }"#; - let value: Value = serde_json::from_str(json).unwrap(); - let schema = Schema::from(&value).unwrap(); - assert!(schema.metadata.is_empty()); - } - #[test] fn create_schema_string() { let schema = person_schema(); diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index ffe112af72cd..2345f1967f24 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -50,7 +50,7 @@ pub fn read_json_file(json_name: &str) -> Result { let json_file = File::open(json_name)?; let reader = BufReader::new(json_file); let arrow_json: Value = serde_json::from_reader(reader).unwrap(); - let schema = Schema::from(&arrow_json["schema"])?; + let schema = schema_from_json(&arrow_json["schema"])?; // read dictionaries let mut dictionaries = HashMap::new(); if let Some(dicts) = arrow_json.get("dictionaries") { diff --git a/integration-testing/src/util/datatype.rs b/integration-testing/src/util/datatype.rs new file mode 100644 index 000000000000..dd0b95b0a836 --- /dev/null +++ b/integration-testing/src/util/datatype.rs @@ -0,0 +1,383 @@ +// Licensed to the Apache 
Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit, UnionMode}; +use arrow::error::{ArrowError, Result}; + +/// Parse a data type from a JSON representation. +pub fn data_type_from_json(json: &serde_json::Value) -> Result<DataType> { + use serde_json::Value; + let default_field = Field::new("", DataType::Boolean, true); + match *json { + Value::Object(ref map) => match map.get("name") { + Some(s) if s == "null" => Ok(DataType::Null), + Some(s) if s == "bool" => Ok(DataType::Boolean), + Some(s) if s == "binary" => Ok(DataType::Binary), + Some(s) if s == "largebinary" => Ok(DataType::LargeBinary), + Some(s) if s == "utf8" => Ok(DataType::Utf8), + Some(s) if s == "largeutf8" => Ok(DataType::LargeUtf8), + Some(s) if s == "fixedsizebinary" => { + // return a list with any type as its child isn't defined in the map + if let Some(Value::Number(size)) = map.get("byteWidth") { + Ok(DataType::FixedSizeBinary(size.as_i64().unwrap() as i32)) + } else { + Err(ArrowError::ParseError( + "Expecting a byteWidth for fixedsizebinary".to_string(), + )) + } + } + Some(s) if s == "decimal" => { + // return a list with any type as its child isn't defined in the map + let precision = match map.get("precision") { + 
Some(p) => Ok(p.as_u64().unwrap().try_into().unwrap()), + None => Err(ArrowError::ParseError( + "Expecting a precision for decimal".to_string(), + )), + }?; + let scale = match map.get("scale") { + Some(s) => Ok(s.as_u64().unwrap().try_into().unwrap()), + _ => Err(ArrowError::ParseError( + "Expecting a scale for decimal".to_string(), + )), + }?; + let bit_width: usize = match map.get("bitWidth") { + Some(b) => b.as_u64().unwrap() as usize, + _ => 128, // Default bit width + }; + + if bit_width == 128 { + Ok(DataType::Decimal128(precision, scale)) + } else if bit_width == 256 { + Ok(DataType::Decimal256(precision, scale)) + } else { + Err(ArrowError::ParseError( + "Decimal bit_width invalid".to_string(), + )) + } + } + Some(s) if s == "floatingpoint" => match map.get("precision") { + Some(p) if p == "HALF" => Ok(DataType::Float16), + Some(p) if p == "SINGLE" => Ok(DataType::Float32), + Some(p) if p == "DOUBLE" => Ok(DataType::Float64), + _ => Err(ArrowError::ParseError( + "floatingpoint precision missing or invalid".to_string(), + )), + }, + Some(s) if s == "timestamp" => { + let unit = match map.get("unit") { + Some(p) if p == "SECOND" => Ok(TimeUnit::Second), + Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond), + Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond), + Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond), + _ => Err(ArrowError::ParseError( + "timestamp unit missing or invalid".to_string(), + )), + }; + let tz = match map.get("timezone") { + None => Ok(None), + Some(serde_json::Value::String(tz)) => Ok(Some(tz.clone())), + _ => Err(ArrowError::ParseError( + "timezone must be a string".to_string(), + )), + }; + Ok(DataType::Timestamp(unit?, tz?)) + } + Some(s) if s == "date" => match map.get("unit") { + Some(p) if p == "DAY" => Ok(DataType::Date32), + Some(p) if p == "MILLISECOND" => Ok(DataType::Date64), + _ => Err(ArrowError::ParseError( + "date unit missing or invalid".to_string(), + )), + }, + Some(s) if s == "time" => { + let 
unit = match map.get("unit") { + Some(p) if p == "SECOND" => Ok(TimeUnit::Second), + Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond), + Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond), + Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond), + _ => Err(ArrowError::ParseError( + "time unit missing or invalid".to_string(), + )), + }; + match map.get("bitWidth") { + Some(p) if p == 32 => Ok(DataType::Time32(unit?)), + Some(p) if p == 64 => Ok(DataType::Time64(unit?)), + _ => Err(ArrowError::ParseError( + "time bitWidth missing or invalid".to_string(), + )), + } + } + Some(s) if s == "duration" => match map.get("unit") { + Some(p) if p == "SECOND" => Ok(DataType::Duration(TimeUnit::Second)), + Some(p) if p == "MILLISECOND" => { + Ok(DataType::Duration(TimeUnit::Millisecond)) + } + Some(p) if p == "MICROSECOND" => { + Ok(DataType::Duration(TimeUnit::Microsecond)) + } + Some(p) if p == "NANOSECOND" => { + Ok(DataType::Duration(TimeUnit::Nanosecond)) + } + _ => Err(ArrowError::ParseError( + "time unit missing or invalid".to_string(), + )), + }, + Some(s) if s == "interval" => match map.get("unit") { + Some(p) if p == "DAY_TIME" => { + Ok(DataType::Interval(IntervalUnit::DayTime)) + } + Some(p) if p == "YEAR_MONTH" => { + Ok(DataType::Interval(IntervalUnit::YearMonth)) + } + Some(p) if p == "MONTH_DAY_NANO" => { + Ok(DataType::Interval(IntervalUnit::MonthDayNano)) + } + _ => Err(ArrowError::ParseError( + "interval unit missing or invalid".to_string(), + )), + }, + Some(s) if s == "int" => match map.get("isSigned") { + Some(&Value::Bool(true)) => match map.get("bitWidth") { + Some(&Value::Number(ref n)) => match n.as_u64() { + Some(8) => Ok(DataType::Int8), + Some(16) => Ok(DataType::Int16), + Some(32) => Ok(DataType::Int32), + Some(64) => Ok(DataType::Int64), + _ => Err(ArrowError::ParseError( + "int bitWidth missing or invalid".to_string(), + )), + }, + _ => Err(ArrowError::ParseError( + "int bitWidth missing or invalid".to_string(), + )), + }, + 
Some(&Value::Bool(false)) => match map.get("bitWidth") { + Some(&Value::Number(ref n)) => match n.as_u64() { + Some(8) => Ok(DataType::UInt8), + Some(16) => Ok(DataType::UInt16), + Some(32) => Ok(DataType::UInt32), + Some(64) => Ok(DataType::UInt64), + _ => Err(ArrowError::ParseError( + "int bitWidth missing or invalid".to_string(), + )), + }, + _ => Err(ArrowError::ParseError( + "int bitWidth missing or invalid".to_string(), + )), + }, + _ => Err(ArrowError::ParseError( + "int signed missing or invalid".to_string(), + )), + }, + Some(s) if s == "list" => { + // return a list with any type as its child isn't defined in the map + Ok(DataType::List(Box::new(default_field))) + } + Some(s) if s == "largelist" => { + // return a largelist with any type as its child isn't defined in the map + Ok(DataType::LargeList(Box::new(default_field))) + } + Some(s) if s == "fixedsizelist" => { + // return a list with any type as its child isn't defined in the map + if let Some(Value::Number(size)) = map.get("listSize") { + Ok(DataType::FixedSizeList( + Box::new(default_field), + size.as_i64().unwrap() as i32, + )) + } else { + Err(ArrowError::ParseError( + "Expecting a listSize for fixedsizelist".to_string(), + )) + } + } + Some(s) if s == "struct" => { + // return an empty `struct` type as its children aren't defined in the map + Ok(DataType::Struct(vec![])) + } + Some(s) if s == "map" => { + if let Some(Value::Bool(keys_sorted)) = map.get("keysSorted") { + // Return a map with an empty type as its children aren't defined in the map + Ok(DataType::Map(Box::new(default_field), *keys_sorted)) + } else { + Err(ArrowError::ParseError( + "Expecting a keysSorted for map".to_string(), + )) + } + } + Some(s) if s == "union" => { + if let Some(Value::String(mode)) = map.get("mode") { + let union_mode = if mode == "SPARSE" { + UnionMode::Sparse + } else if mode == "DENSE" { + UnionMode::Dense + } else { + return Err(ArrowError::ParseError(format!( + "Unknown union mode {:?} for union", + 
mode + ))); + }; + if let Some(type_ids) = map.get("typeIds") { + let type_ids = type_ids + .as_array() + .unwrap() + .iter() + .map(|t| t.as_i64().unwrap() as i8) + .collect::<Vec<i8>>(); + + let default_fields = type_ids + .iter() + .map(|_| default_field.clone()) + .collect::<Vec<Field>>(); + + Ok(DataType::Union(default_fields, type_ids, union_mode)) + } else { + Err(ArrowError::ParseError( + "Expecting a typeIds for union ".to_string(), + )) + } + } else { + Err(ArrowError::ParseError( + "Expecting a mode for union".to_string(), + )) + } + } + Some(other) => Err(ArrowError::ParseError(format!( + "invalid or unsupported type name: {} in {:?}", + other, json + ))), + None => Err(ArrowError::ParseError("type name missing".to_string())), + }, + _ => Err(ArrowError::ParseError( + "invalid json value type".to_string(), + )), + } +} + +/// Generate a JSON representation of the data type. +pub fn data_type_to_json(data_type: &DataType) -> serde_json::Value { + use serde_json::json; + match data_type { + DataType::Null => json!({"name": "null"}), + DataType::Boolean => json!({"name": "bool"}), + DataType::Int8 => json!({"name": "int", "bitWidth": 8, "isSigned": true}), + DataType::Int16 => json!({"name": "int", "bitWidth": 16, "isSigned": true}), + DataType::Int32 => json!({"name": "int", "bitWidth": 32, "isSigned": true}), + DataType::Int64 => json!({"name": "int", "bitWidth": 64, "isSigned": true}), + DataType::UInt8 => json!({"name": "int", "bitWidth": 8, "isSigned": false}), + DataType::UInt16 => json!({"name": "int", "bitWidth": 16, "isSigned": false}), + DataType::UInt32 => json!({"name": "int", "bitWidth": 32, "isSigned": false}), + DataType::UInt64 => json!({"name": "int", "bitWidth": 64, "isSigned": false}), + DataType::Float16 => json!({"name": "floatingpoint", "precision": "HALF"}), + DataType::Float32 => json!({"name": "floatingpoint", "precision": "SINGLE"}), + DataType::Float64 => json!({"name": "floatingpoint", "precision": "DOUBLE"}), + DataType::Utf8 => json!({"name": 
"utf8"}), + DataType::LargeUtf8 => json!({"name": "largeutf8"}), + DataType::Binary => json!({"name": "binary"}), + DataType::LargeBinary => json!({"name": "largebinary"}), + DataType::FixedSizeBinary(byte_width) => { + json!({"name": "fixedsizebinary", "byteWidth": byte_width}) + } + DataType::Struct(_) => json!({"name": "struct"}), + DataType::Union(_, _, _) => json!({"name": "union"}), + DataType::List(_) => json!({ "name": "list"}), + DataType::LargeList(_) => json!({ "name": "largelist"}), + DataType::FixedSizeList(_, length) => { + json!({"name":"fixedsizelist", "listSize": length}) + } + DataType::Time32(unit) => { + json!({"name": "time", "bitWidth": 32, "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}) + } + DataType::Time64(unit) => { + json!({"name": "time", "bitWidth": 64, "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}) + } + DataType::Date32 => { + json!({"name": "date", "unit": "DAY"}) + } + DataType::Date64 => { + json!({"name": "date", "unit": "MILLISECOND"}) + } + DataType::Timestamp(unit, None) => { + json!({"name": "timestamp", "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}) + } + DataType::Timestamp(unit, Some(tz)) => { + json!({"name": "timestamp", "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }, "timezone": tz}) + } + DataType::Interval(unit) => json!({"name": "interval", "unit": match unit { + IntervalUnit::YearMonth => "YEAR_MONTH", + IntervalUnit::DayTime => "DAY_TIME", + IntervalUnit::MonthDayNano => 
"MONTH_DAY_NANO", + }}), + DataType::Duration(unit) => json!({"name": "duration", "unit": match unit { + TimeUnit::Second => "SECOND", + TimeUnit::Millisecond => "MILLISECOND", + TimeUnit::Microsecond => "MICROSECOND", + TimeUnit::Nanosecond => "NANOSECOND", + }}), + DataType::Dictionary(_, _) => json!({ "name": "dictionary"}), + DataType::Decimal128(precision, scale) => { + json!({"name": "decimal", "precision": precision, "scale": scale, "bitWidth": 128}) + } + DataType::Decimal256(precision, scale) => { + json!({"name": "decimal", "precision": precision, "scale": scale, "bitWidth": 256}) + } + DataType::Map(_, keys_sorted) => { + json!({"name": "map", "keysSorted": keys_sorted}) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::Value; + + #[test] + fn parse_utf8_from_json() { + let json = "{\"name\":\"utf8\"}"; + let value: Value = serde_json::from_str(json).unwrap(); + let dt = data_type_from_json(&value).unwrap(); + assert_eq!(DataType::Utf8, dt); + } + + #[test] + fn parse_int32_from_json() { + let json = "{\"name\": \"int\", \"isSigned\": true, \"bitWidth\": 32}"; + let value: Value = serde_json::from_str(json).unwrap(); + let dt = data_type_from_json(&value).unwrap(); + assert_eq!(DataType::Int32, dt); + } +} diff --git a/integration-testing/src/util/field.rs b/integration-testing/src/util/field.rs new file mode 100644 index 000000000000..a2becc004d13 --- /dev/null +++ b/integration-testing/src/util/field.rs @@ -0,0 +1,586 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::util::datatype::{data_type_from_json, data_type_to_json}; +use arrow::datatypes::{DataType, Field}; +use arrow::error::{ArrowError, Result}; +use std::collections::BTreeMap; + +/// Parse a `Field` definition from a JSON representation. +pub fn field_from_json(json: &serde_json::Value) -> Result { + use serde_json::Value; + match *json { + Value::Object(ref map) => { + let name = match map.get("name") { + Some(&Value::String(ref name)) => name.to_string(), + _ => { + return Err(ArrowError::ParseError( + "Field missing 'name' attribute".to_string(), + )); + } + }; + let nullable = match map.get("nullable") { + Some(&Value::Bool(b)) => b, + _ => { + return Err(ArrowError::ParseError( + "Field missing 'nullable' attribute".to_string(), + )); + } + }; + let data_type = match map.get("type") { + Some(t) => data_type_from_json(t)?, + _ => { + return Err(ArrowError::ParseError( + "Field missing 'type' attribute".to_string(), + )); + } + }; + + // Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz + let metadata = match map.get("metadata") { + Some(&Value::Array(ref values)) => { + let mut res: BTreeMap = BTreeMap::new(); + for value in values { + match value.as_object() { + Some(map) => { + if map.len() != 2 { + return Err(ArrowError::ParseError( + "Field 'metadata' must have exact two entries for each key-value map".to_string(), + )); + } + if let (Some(k), Some(v)) = + (map.get("key"), map.get("value")) + { + if let (Some(k_str), Some(v_str)) = + (k.as_str(), v.as_str()) + 
{ + res.insert( + k_str.to_string().clone(), + v_str.to_string().clone(), + ); + } else { + return Err(ArrowError::ParseError("Field 'metadata' must have map value of string type".to_string())); + } + } else { + return Err(ArrowError::ParseError("Field 'metadata' lacks map keys named \"key\" or \"value\"".to_string())); + } + } + _ => { + return Err(ArrowError::ParseError( + "Field 'metadata' contains non-object key-value pair" + .to_string(), + )); + } + } + } + Some(res) + } + // We also support map format, because Schema's metadata supports this. + // See https://github.com/apache/arrow/pull/5907 + Some(&Value::Object(ref values)) => { + let mut res: BTreeMap = BTreeMap::new(); + for (k, v) in values { + if let Some(str_value) = v.as_str() { + res.insert(k.clone(), str_value.to_string().clone()); + } else { + return Err(ArrowError::ParseError(format!( + "Field 'metadata' contains non-string value for key {}", + k + ))); + } + } + Some(res) + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field `metadata` is not json array".to_string(), + )); + } + _ => None, + }; + + // if data_type is a struct or list, get its children + let data_type = match data_type { + DataType::List(_) + | DataType::LargeList(_) + | DataType::FixedSizeList(_, _) => match map.get("children") { + Some(Value::Array(values)) => { + if values.len() != 1 { + return Err(ArrowError::ParseError( + "Field 'children' must have one element for a list data type".to_string(), + )); + } + match data_type { + DataType::List(_) => { + DataType::List(Box::new(field_from_json(&values[0])?)) + } + DataType::LargeList(_) => DataType::LargeList(Box::new( + field_from_json(&values[0])?, + )), + DataType::FixedSizeList(_, int) => DataType::FixedSizeList( + Box::new(field_from_json(&values[0])?), + int, + ), + _ => unreachable!( + "Data type should be a list, largelist or fixedsizelist" + ), + } + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field 'children' must be an array".to_string(), + 
)) + } + None => { + return Err(ArrowError::ParseError( + "Field missing 'children' attribute".to_string(), + )); + } + }, + DataType::Struct(mut fields) => match map.get("children") { + Some(Value::Array(values)) => { + let struct_fields: Result> = + values.iter().map(field_from_json).collect(); + fields.append(&mut struct_fields?); + DataType::Struct(fields) + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field 'children' must be an array".to_string(), + )) + } + None => { + return Err(ArrowError::ParseError( + "Field missing 'children' attribute".to_string(), + )); + } + }, + DataType::Map(_, keys_sorted) => { + match map.get("children") { + Some(Value::Array(values)) if values.len() == 1 => { + let child = field_from_json(&values[0])?; + // child must be a struct + match child.data_type() { + DataType::Struct(map_fields) if map_fields.len() == 2 => { + DataType::Map(Box::new(child), keys_sorted) + } + t => { + return Err(ArrowError::ParseError( + format!("Map children should be a struct with 2 fields, found {:?}", t) + )) + } + } + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field 'children' must be an array with 1 element" + .to_string(), + )) + } + None => { + return Err(ArrowError::ParseError( + "Field missing 'children' attribute".to_string(), + )); + } + } + } + DataType::Union(_, type_ids, mode) => match map.get("children") { + Some(Value::Array(values)) => { + let union_fields: Vec = + values.iter().map(field_from_json).collect::>()?; + DataType::Union(union_fields, type_ids, mode) + } + Some(_) => { + return Err(ArrowError::ParseError( + "Field 'children' must be an array".to_string(), + )) + } + None => { + return Err(ArrowError::ParseError( + "Field missing 'children' attribute".to_string(), + )); + } + }, + _ => data_type, + }; + + let mut dict_id = 0; + let mut dict_is_ordered = false; + + let data_type = match map.get("dictionary") { + Some(dictionary) => { + let index_type = match dictionary.get("indexType") { + Some(t) => 
data_type_from_json(t)?, + _ => { + return Err(ArrowError::ParseError( + "Field missing 'indexType' attribute".to_string(), + )); + } + }; + dict_id = match dictionary.get("id") { + Some(Value::Number(n)) => n.as_i64().unwrap(), + _ => { + return Err(ArrowError::ParseError( + "Field missing 'id' attribute".to_string(), + )); + } + }; + dict_is_ordered = match dictionary.get("isOrdered") { + Some(&Value::Bool(n)) => n, + _ => { + return Err(ArrowError::ParseError( + "Field missing 'isOrdered' attribute".to_string(), + )); + } + }; + DataType::Dictionary(Box::new(index_type), Box::new(data_type)) + } + _ => data_type, + }; + + let mut field = + Field::new_dict(&name, data_type, nullable, dict_id, dict_is_ordered); + field.set_metadata(metadata); + Ok(field) + } + _ => Err(ArrowError::ParseError( + "Invalid json value type for field".to_string(), + )), + } +} + +/// Generate a JSON representation of the `Field`. +pub fn field_to_json(field: &Field) -> serde_json::Value { + let children: Vec = match field.data_type() { + DataType::Struct(fields) => fields.iter().map(field_to_json).collect(), + DataType::List(field) + | DataType::LargeList(field) + | DataType::FixedSizeList(field, _) + | DataType::Map(field, _) => vec![field_to_json(field)], + _ => vec![], + }; + + match field.data_type() { + DataType::Dictionary(ref index_type, ref value_type) => serde_json::json!({ + "name": field.name(), + "nullable": field.is_nullable(), + "type": data_type_to_json(value_type), + "children": children, + "dictionary": { + "id": field.dict_id().unwrap(), + "indexType": data_type_to_json(index_type), + "isOrdered": field.dict_is_ordered().unwrap(), + } + }), + _ => serde_json::json!({ + "name": field.name(), + "nullable": field.is_nullable(), + "type": data_type_to_json(field.data_type()), + "children": children + }), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::datatypes::UnionMode; + use serde_json::Value; + + #[test] + fn struct_field_to_json() { + let f = 
Field::new( + "address", + DataType::Struct(vec![ + Field::new("street", DataType::Utf8, false), + Field::new("zip", DataType::UInt16, false), + ]), + false, + ); + let value: Value = serde_json::from_str( + r#"{ + "name": "address", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [ + { + "name": "street", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "zip", + "nullable": false, + "type": { + "name": "int", + "bitWidth": 16, + "isSigned": false + }, + "children": [] + } + ] + }"#, + ) + .unwrap(); + assert_eq!(value, field_to_json(&f)); + } + + #[test] + fn map_field_to_json() { + let f = Field::new( + "my_map", + DataType::Map( + Box::new(Field::new( + "my_entries", + DataType::Struct(vec![ + Field::new("my_keys", DataType::Utf8, false), + Field::new("my_values", DataType::UInt16, true), + ]), + false, + )), + true, + ), + false, + ); + let value: Value = serde_json::from_str( + r#"{ + "name": "my_map", + "nullable": false, + "type": { + "name": "map", + "keysSorted": true + }, + "children": [ + { + "name": "my_entries", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [ + { + "name": "my_keys", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "my_values", + "nullable": true, + "type": { + "name": "int", + "bitWidth": 16, + "isSigned": false + }, + "children": [] + } + ] + } + ] + }"#, + ) + .unwrap(); + assert_eq!(value, field_to_json(&f)); + } + + #[test] + fn primitive_field_to_json() { + let f = Field::new("first_name", DataType::Utf8, false); + let value: Value = serde_json::from_str( + r#"{ + "name": "first_name", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }"#, + ) + .unwrap(); + assert_eq!(value, field_to_json(&f)); + } + #[test] + fn parse_struct_from_json() { + let json = r#" + { + "name": "address", + "type": { + "name": "struct" + }, + "nullable": false, + "children": [ + { + "name": "street", 
+ "type": { + "name": "utf8" + }, + "nullable": false, + "children": [] + }, + { + "name": "zip", + "type": { + "name": "int", + "isSigned": false, + "bitWidth": 16 + }, + "nullable": false, + "children": [] + } + ] + } + "#; + let value: Value = serde_json::from_str(json).unwrap(); + let dt = field_from_json(&value).unwrap(); + + let expected = Field::new( + "address", + DataType::Struct(vec![ + Field::new("street", DataType::Utf8, false), + Field::new("zip", DataType::UInt16, false), + ]), + false, + ); + + assert_eq!(expected, dt); + } + + #[test] + fn parse_map_from_json() { + let json = r#" + { + "name": "my_map", + "nullable": false, + "type": { + "name": "map", + "keysSorted": true + }, + "children": [ + { + "name": "my_entries", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [ + { + "name": "my_keys", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "my_values", + "nullable": true, + "type": { + "name": "int", + "bitWidth": 16, + "isSigned": false + }, + "children": [] + } + ] + } + ] + } + "#; + let value: Value = serde_json::from_str(json).unwrap(); + let dt = field_from_json(&value).unwrap(); + + let expected = Field::new( + "my_map", + DataType::Map( + Box::new(Field::new( + "my_entries", + DataType::Struct(vec![ + Field::new("my_keys", DataType::Utf8, false), + Field::new("my_values", DataType::UInt16, true), + ]), + false, + )), + true, + ), + false, + ); + + assert_eq!(expected, dt); + } + + #[test] + fn parse_union_from_json() { + let json = r#" + { + "name": "my_union", + "nullable": false, + "type": { + "name": "union", + "mode": "SPARSE", + "typeIds": [ + 5, + 7 + ] + }, + "children": [ + { + "name": "f1", + "type": { + "name": "int", + "isSigned": true, + "bitWidth": 32 + }, + "nullable": true, + "children": [] + }, + { + "name": "f2", + "type": { + "name": "utf8" + }, + "nullable": true, + "children": [] + } + ] + } + "#; + let value: Value = serde_json::from_str(json).unwrap(); 
+ let dt = field_from_json(&value).unwrap(); + + let expected = Field::new( + "my_union", + DataType::Union( + vec![ + Field::new("f1", DataType::Int32, true), + Field::new("f2", DataType::Utf8, true), + ], + vec![5, 7], + UnionMode::Sparse, + ), + false, + ); + + assert_eq!(expected, dt); + } +} diff --git a/integration-testing/src/util.rs b/integration-testing/src/util/mod.rs similarity index 99% rename from integration-testing/src/util.rs rename to integration-testing/src/util/mod.rs index 3a127ea35d48..9ecd301360fe 100644 --- a/integration-testing/src/util.rs +++ b/integration-testing/src/util/mod.rs @@ -36,7 +36,17 @@ use arrow::record_batch::{RecordBatch, RecordBatchReader}; use arrow::util::bit_util; use arrow::util::decimal::Decimal256; +mod datatype; +mod field; +mod schema; + +use crate::util::datatype::data_type_to_json; +use crate::util::field::field_from_json; +pub use schema::*; + /// A struct that represents an Arrow file with a schema and record batches +/// +/// See #[derive(Deserialize, Serialize, Debug)] pub struct ArrowJson { pub schema: ArrowJsonSchema, @@ -90,7 +100,7 @@ impl From<&Field> for ArrowJsonField { Self { name: field.name().to_string(), - field_type: field.data_type().to_json(), + field_type: data_type_to_json(field.data_type()), nullable: field.is_nullable(), children: vec![], dictionary: None, // TODO: not enough info @@ -255,9 +265,8 @@ impl ArrowJsonField { /// TODO: convert to use an Into fn to_arrow_field(&self) -> Result { // a bit regressive, but we have to convert the field to JSON in order to convert it - let field = serde_json::to_value(self) - .map_err(|error| ArrowError::JsonError(error.to_string()))?; - Field::from(&field) + let field = serde_json::to_value(self)?; + field_from_json(&field) } } diff --git a/integration-testing/src/util/schema.rs b/integration-testing/src/util/schema.rs new file mode 100644 index 000000000000..7e3475e6f460 --- /dev/null +++ b/integration-testing/src/util/schema.rs @@ -0,0 +1,733 @@ +// 
Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::util::field::{field_from_json, field_to_json}; +use arrow::datatypes::Schema; +use arrow::error::{ArrowError, Result}; +use std::collections::HashMap; + +/// Generate a JSON representation of the `Schema`. +pub fn schema_to_json(schema: &Schema) -> serde_json::Value { + serde_json::json!({ + "fields": schema.fields().iter().map(field_to_json).collect::>(), + "metadata": serde_json::to_value(schema.metadata()).unwrap() + }) +} + +/// Parse a `Schema` definition from a JSON representation. +pub fn schema_from_json(json: &serde_json::Value) -> Result { + use serde_json::Value; + match *json { + Value::Object(ref schema) => { + let fields = if let Some(Value::Array(fields)) = schema.get("fields") { + fields.iter().map(field_from_json).collect::>()? + } else { + return Err(ArrowError::ParseError( + "Schema fields should be an array".to_string(), + )); + }; + + let metadata = if let Some(value) = schema.get("metadata") { + from_metadata(value)? 
+ } else { + HashMap::default() + }; + + Ok(Schema::new_with_metadata(fields, metadata)) + } + _ => Err(ArrowError::ParseError( + "Invalid json value type for schema".to_string(), + )), + } +} + +/// Parse a `metadata` definition from a JSON representation. +/// The JSON can either be an Object or an Array of Objects. +fn from_metadata(json: &serde_json::Value) -> Result> { + use serde_json::Value; + match json { + Value::Array(_) => { + let mut hashmap = HashMap::new(); + let values: Vec = serde_json::from_value(json.clone()) + .map_err(|_| { + ArrowError::JsonError( + "Unable to parse object into key-value pair".to_string(), + ) + })?; + for meta in values { + hashmap.insert(meta.key.clone(), meta.value); + } + Ok(hashmap) + } + Value::Object(md) => md + .iter() + .map(|(k, v)| { + if let Value::String(v) = v { + Ok((k.to_string(), v.to_string())) + } else { + Err(ArrowError::ParseError( + "metadata `value` field must be a string".to_string(), + )) + } + }) + .collect::>(), + _ => Err(ArrowError::ParseError( + "`metadata` field must be an object".to_string(), + )), + } +} + +#[derive(serde::Deserialize)] +struct MetadataKeyValue { + key: String, + value: String, +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; + use serde_json::Value; + + #[test] + fn schema_json() { + // Add some custom metadata + let metadata: HashMap = + [("Key".to_string(), "Value".to_string())] + .iter() + .cloned() + .collect(); + + let schema = Schema::new_with_metadata( + vec![ + Field::new("c1", DataType::Utf8, false), + Field::new("c2", DataType::Binary, false), + Field::new("c3", DataType::FixedSizeBinary(3), false), + Field::new("c4", DataType::Boolean, false), + Field::new("c5", DataType::Date32, false), + Field::new("c6", DataType::Date64, false), + Field::new("c7", DataType::Time32(TimeUnit::Second), false), + Field::new("c8", DataType::Time32(TimeUnit::Millisecond), false), + Field::new("c9", 
DataType::Time32(TimeUnit::Microsecond), false), + Field::new("c10", DataType::Time32(TimeUnit::Nanosecond), false), + Field::new("c11", DataType::Time64(TimeUnit::Second), false), + Field::new("c12", DataType::Time64(TimeUnit::Millisecond), false), + Field::new("c13", DataType::Time64(TimeUnit::Microsecond), false), + Field::new("c14", DataType::Time64(TimeUnit::Nanosecond), false), + Field::new("c15", DataType::Timestamp(TimeUnit::Second, None), false), + Field::new( + "c16", + DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".to_string())), + false, + ), + Field::new( + "c17", + DataType::Timestamp( + TimeUnit::Microsecond, + Some("Africa/Johannesburg".to_string()), + ), + false, + ), + Field::new( + "c18", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), + Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false), + Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), + Field::new("c21", DataType::Interval(IntervalUnit::MonthDayNano), false), + Field::new( + "c22", + DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), + false, + ), + Field::new( + "c23", + DataType::FixedSizeList( + Box::new(Field::new("bools", DataType::Boolean, false)), + 5, + ), + false, + ), + Field::new( + "c24", + DataType::List(Box::new(Field::new( + "inner_list", + DataType::List(Box::new(Field::new( + "struct", + DataType::Struct(vec![]), + true, + ))), + false, + ))), + true, + ), + Field::new( + "c25", + DataType::Struct(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::UInt16, false), + ]), + false, + ), + Field::new("c26", DataType::Interval(IntervalUnit::YearMonth), true), + Field::new("c27", DataType::Interval(IntervalUnit::DayTime), true), + Field::new("c28", DataType::Interval(IntervalUnit::MonthDayNano), true), + Field::new("c29", DataType::Duration(TimeUnit::Second), false), + Field::new("c30", DataType::Duration(TimeUnit::Millisecond), false), + Field::new("c31", 
DataType::Duration(TimeUnit::Microsecond), false), + Field::new("c32", DataType::Duration(TimeUnit::Nanosecond), false), + Field::new_dict( + "c33", + DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Utf8), + ), + true, + 123, + true, + ), + Field::new("c34", DataType::LargeBinary, true), + Field::new("c35", DataType::LargeUtf8, true), + Field::new( + "c36", + DataType::LargeList(Box::new(Field::new( + "inner_large_list", + DataType::LargeList(Box::new(Field::new( + "struct", + DataType::Struct(vec![]), + false, + ))), + true, + ))), + true, + ), + Field::new( + "c37", + DataType::Map( + Box::new(Field::new( + "my_entries", + DataType::Struct(vec![ + Field::new("my_keys", DataType::Utf8, false), + Field::new("my_values", DataType::UInt16, true), + ]), + false, + )), + true, + ), + false, + ), + ], + metadata, + ); + + let expected = schema_to_json(&schema); + let json = r#"{ + "fields": [ + { + "name": "c1", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "c2", + "nullable": false, + "type": { + "name": "binary" + }, + "children": [] + }, + { + "name": "c3", + "nullable": false, + "type": { + "name": "fixedsizebinary", + "byteWidth": 3 + }, + "children": [] + }, + { + "name": "c4", + "nullable": false, + "type": { + "name": "bool" + }, + "children": [] + }, + { + "name": "c5", + "nullable": false, + "type": { + "name": "date", + "unit": "DAY" + }, + "children": [] + }, + { + "name": "c6", + "nullable": false, + "type": { + "name": "date", + "unit": "MILLISECOND" + }, + "children": [] + }, + { + "name": "c7", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 32, + "unit": "SECOND" + }, + "children": [] + }, + { + "name": "c8", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 32, + "unit": "MILLISECOND" + }, + "children": [] + }, + { + "name": "c9", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 32, + "unit": "MICROSECOND" + }, + "children": [] + }, + { 
+ "name": "c10", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 32, + "unit": "NANOSECOND" + }, + "children": [] + }, + { + "name": "c11", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 64, + "unit": "SECOND" + }, + "children": [] + }, + { + "name": "c12", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 64, + "unit": "MILLISECOND" + }, + "children": [] + }, + { + "name": "c13", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 64, + "unit": "MICROSECOND" + }, + "children": [] + }, + { + "name": "c14", + "nullable": false, + "type": { + "name": "time", + "bitWidth": 64, + "unit": "NANOSECOND" + }, + "children": [] + }, + { + "name": "c15", + "nullable": false, + "type": { + "name": "timestamp", + "unit": "SECOND" + }, + "children": [] + }, + { + "name": "c16", + "nullable": false, + "type": { + "name": "timestamp", + "unit": "MILLISECOND", + "timezone": "UTC" + }, + "children": [] + }, + { + "name": "c17", + "nullable": false, + "type": { + "name": "timestamp", + "unit": "MICROSECOND", + "timezone": "Africa/Johannesburg" + }, + "children": [] + }, + { + "name": "c18", + "nullable": false, + "type": { + "name": "timestamp", + "unit": "NANOSECOND" + }, + "children": [] + }, + { + "name": "c19", + "nullable": false, + "type": { + "name": "interval", + "unit": "DAY_TIME" + }, + "children": [] + }, + { + "name": "c20", + "nullable": false, + "type": { + "name": "interval", + "unit": "YEAR_MONTH" + }, + "children": [] + }, + { + "name": "c21", + "nullable": false, + "type": { + "name": "interval", + "unit": "MONTH_DAY_NANO" + }, + "children": [] + }, + { + "name": "c22", + "nullable": false, + "type": { + "name": "list" + }, + "children": [ + { + "name": "item", + "nullable": true, + "type": { + "name": "bool" + }, + "children": [] + } + ] + }, + { + "name": "c23", + "nullable": false, + "type": { + "name": "fixedsizelist", + "listSize": 5 + }, + "children": [ + { + "name": "bools", + "nullable": false, + 
"type": { + "name": "bool" + }, + "children": [] + } + ] + }, + { + "name": "c24", + "nullable": true, + "type": { + "name": "list" + }, + "children": [ + { + "name": "inner_list", + "nullable": false, + "type": { + "name": "list" + }, + "children": [ + { + "name": "struct", + "nullable": true, + "type": { + "name": "struct" + }, + "children": [] + } + ] + } + ] + }, + { + "name": "c25", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [ + { + "name": "a", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "b", + "nullable": false, + "type": { + "name": "int", + "bitWidth": 16, + "isSigned": false + }, + "children": [] + } + ] + }, + { + "name": "c26", + "nullable": true, + "type": { + "name": "interval", + "unit": "YEAR_MONTH" + }, + "children": [] + }, + { + "name": "c27", + "nullable": true, + "type": { + "name": "interval", + "unit": "DAY_TIME" + }, + "children": [] + }, + { + "name": "c28", + "nullable": true, + "type": { + "name": "interval", + "unit": "MONTH_DAY_NANO" + }, + "children": [] + }, + { + "name": "c29", + "nullable": false, + "type": { + "name": "duration", + "unit": "SECOND" + }, + "children": [] + }, + { + "name": "c30", + "nullable": false, + "type": { + "name": "duration", + "unit": "MILLISECOND" + }, + "children": [] + }, + { + "name": "c31", + "nullable": false, + "type": { + "name": "duration", + "unit": "MICROSECOND" + }, + "children": [] + }, + { + "name": "c32", + "nullable": false, + "type": { + "name": "duration", + "unit": "NANOSECOND" + }, + "children": [] + }, + { + "name": "c33", + "nullable": true, + "children": [], + "type": { + "name": "utf8" + }, + "dictionary": { + "id": 123, + "indexType": { + "name": "int", + "bitWidth": 32, + "isSigned": true + }, + "isOrdered": true + } + }, + { + "name": "c34", + "nullable": true, + "type": { + "name": "largebinary" + }, + "children": [] + }, + { + "name": "c35", + "nullable": true, + "type": { + "name": "largeutf8" + }, + 
"children": [] + }, + { + "name": "c36", + "nullable": true, + "type": { + "name": "largelist" + }, + "children": [ + { + "name": "inner_large_list", + "nullable": true, + "type": { + "name": "largelist" + }, + "children": [ + { + "name": "struct", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [] + } + ] + } + ] + }, + { + "name": "c37", + "nullable": false, + "type": { + "name": "map", + "keysSorted": true + }, + "children": [ + { + "name": "my_entries", + "nullable": false, + "type": { + "name": "struct" + }, + "children": [ + { + "name": "my_keys", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + }, + { + "name": "my_values", + "nullable": true, + "type": { + "name": "int", + "bitWidth": 16, + "isSigned": false + }, + "children": [] + } + ] + } + ] + } + ], + "metadata" : { + "Key": "Value" + } + }"#; + let value: Value = serde_json::from_str(json).unwrap(); + assert_eq!(expected, value); + + // convert back to a schema + let value: Value = serde_json::from_str(json).unwrap(); + let schema2 = schema_from_json(&value).unwrap(); + + assert_eq!(schema, schema2); + + // Check that empty metadata produces empty value in JSON and can be parsed + let json = r#"{ + "fields": [ + { + "name": "c1", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + } + ], + "metadata": {} + }"#; + let value: Value = serde_json::from_str(json).unwrap(); + let schema = schema_from_json(&value).unwrap(); + assert!(schema.metadata.is_empty()); + + // Check that metadata field is not required in the JSON. + let json = r#"{ + "fields": [ + { + "name": "c1", + "nullable": false, + "type": { + "name": "utf8" + }, + "children": [] + } + ] + }"#; + let value: Value = serde_json::from_str(json).unwrap(); + let schema = schema_from_json(&value).unwrap(); + assert!(schema.metadata.is_empty()); + } +}