Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Added support for DataType::Map and MapArray #464

Merged
merged 3 commits into from
Oct 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ we also use the `0.x.y` versioning, since we are iterating over the API.
## Features in the original not available in this crate

* Parquet read and write of struct and nested lists.
* Map type

## Features in this crate not in pyarrow

Expand All @@ -86,7 +85,7 @@ we also use the `0.x.y` versioning, since we are iterating over the API.

## Features in pyarrow not in this crate

Too many to enumerate; e.g. nested dictionary arrays, map, nested parquet.
Too many to enumerate; e.g. nested dictionary arrays, nested parquet.

## FAQ

Expand Down
17 changes: 17 additions & 0 deletions arrow-pyarrow-integration-testing/tests/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,23 @@ def test_dict(self):
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_map(self):
"""
Python -> Rust -> Python
"""
offsets = [0, None, 2, 6]
pykeys = [b"a", b"b", b"c", b"d", b"e", b"f"]
pyitems = [1, 2, 3, None, 4, 5]
keys = pyarrow.array(pykeys, type="binary")
items = pyarrow.array(pyitems, type="i4")

a = pyarrow.MapArray.from_arrays(offsets, keys, items)
b = arrow_pyarrow_integration_testing.round_trip_array(a)

b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_sparse_union(self):
"""
Python -> Rust -> Python
Expand Down
23 changes: 19 additions & 4 deletions integration-testing/unskip.patch
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py
index d0c4b3d6c..936351c80 100644
index d0c4b3d6c..0ce29fb8a 100644
--- a/dev/archery/archery/integration/datagen.py
+++ b/dev/archery/archery/integration/datagen.py
@@ -1568,8 +1568,7 @@ def get_generated_json_files(tempdir=None):
Expand All @@ -12,7 +12,7 @@ index d0c4b3d6c..936351c80 100644

generate_decimal256_case()
.skip_category('Go') # TODO(ARROW-7948): Decimal + Go
@@ -1579,13 +1578,11 @@ def get_generated_json_files(tempdir=None):
@@ -1579,17 +1578,14 @@ def get_generated_json_files(tempdir=None):
generate_datetime_case(),

generate_interval_case()
Expand All @@ -27,8 +27,13 @@ index d0c4b3d6c..936351c80 100644
+ .skip_category('JS'),


generate_map_case()
@@ -1603,13 +1600,11 @@ def get_generated_json_files(tempdir=None):
- generate_map_case()
- .skip_category('Rust'),
+ generate_map_case(),

generate_non_canonical_map_case()
.skip_category('Java') # TODO(ARROW-8715)
@@ -1603,13 +1599,11 @@ def get_generated_json_files(tempdir=None):

generate_nested_large_offsets_case()
.skip_category('Go')
Expand All @@ -44,3 +49,13 @@ index d0c4b3d6c..936351c80 100644

generate_custom_metadata_case()
.skip_category('JS'),
@@ -1634,8 +1628,7 @@ def get_generated_json_files(tempdir=None):

generate_extension_case()
.skip_category('Go') # TODO(ARROW-3039): requires dictionaries
- .skip_category('JS')
- .skip_category('Rust'),
+ .skip_category('JS'),
]

generated_paths = []
1 change: 1 addition & 0 deletions src/array/display.rs
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ pub fn get_value_display<'a>(array: &'a dyn Array) -> Box<dyn Fn(usize) -> Strin
DataType::UInt64 => dyn_dict!(array, u64),
_ => unreachable!(),
},
Map(_, _) => todo!(),
Struct(_) => {
let a = array.as_any().downcast_ref::<StructArray>().unwrap();
let displays = a
Expand Down
5 changes: 5 additions & 0 deletions src/array/equal/map.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
use crate::array::{Array, MapArray};

pub(super) fn equal(lhs: &MapArray, rhs: &MapArray) -> bool {
lhs.data_type() == rhs.data_type() && lhs.len() == rhs.len() && lhs.iter().eq(rhs.iter())
}
6 changes: 6 additions & 0 deletions src/array/equal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ mod dictionary;
mod fixed_size_binary;
mod fixed_size_list;
mod list;
mod map;
mod null;
mod primitive;
mod struct_;
Expand Down Expand Up @@ -235,5 +236,10 @@ pub fn equal(lhs: &dyn Array, rhs: &dyn Array) -> bool {
let rhs = rhs.as_any().downcast_ref().unwrap();
union::equal(lhs, rhs)
}
Map => {
let lhs = lhs.as_any().downcast_ref().unwrap();
let rhs = rhs.as_any().downcast_ref().unwrap();
map::equal(lhs, rhs)
}
}
}
1 change: 1 addition & 0 deletions src/array/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ pub fn buffers_children_dictionary(array: &dyn Array) -> BuffersChildren {
FixedSizeList => ffi_dyn!(array, FixedSizeListArray),
Struct => ffi_dyn!(array, StructArray),
Union => ffi_dyn!(array, UnionArray),
Map => ffi_dyn!(array, MapArray),
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
let array = array.as_any().downcast_ref::<DictionaryArray<$T>>().unwrap();
Expand Down
2 changes: 1 addition & 1 deletion src/array/growable/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ pub fn make_growable<'a>(
use_validity,
capacity
),
Union => todo!(),
Union | Map => todo!(),
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
dyn_dict_growable!($T, arrays, use_validity, capacity)
Expand Down
21 changes: 8 additions & 13 deletions src/array/list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,19 +129,14 @@ impl<O: Offset> ListArray<O> {
/// Returns the element at index `i`
#[inline]
pub fn value(&self, i: usize) -> Box<dyn Array> {
if self.is_null(i) {
jorgecarleitao marked this conversation as resolved.
Show resolved Hide resolved
new_empty_array(self.values.data_type().clone())
} else {
let offsets = self.offsets.as_slice();
let offset = offsets[i];
let offset_1 = offsets[i + 1];
let length = (offset_1 - offset).to_usize();

// Safety:
// One of the invariants of the struct
// is that offsets are in bounds
unsafe { self.values.slice_unchecked(offset.to_usize(), length) }
}
let offset = self.offsets[i];
let offset_1 = self.offsets[i + 1];
let length = (offset_1 - offset).to_usize();

// Safety:
// One of the invariants of the struct
// is that offsets are in bounds
unsafe { self.values.slice_unchecked(offset.to_usize(), length) }
}

/// Returns the element at index `i` as &str
Expand Down
41 changes: 41 additions & 0 deletions src/array/map/ffi.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
use std::sync::Arc;

use crate::{array::FromFfi, error::Result, ffi};

use super::super::{ffi::ToFfi, Array};
use super::MapArray;

unsafe impl ToFfi for MapArray {
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>> {
vec![
self.validity.as_ref().map(|x| x.as_ptr()),
std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8),
]
}

fn offset(&self) -> usize {
self.offset
}

fn children(&self) -> Vec<Arc<dyn Array>> {
vec![self.field.clone()]
}
}

impl<A: ffi::ArrowArrayRef> FromFfi<A> for MapArray {
unsafe fn try_from_ffi(array: A) -> Result<Self> {
let data_type = array.field().data_type().clone();
let length = array.array().len();
let offset = array.array().offset();
let mut validity = unsafe { array.validity() }?;
let mut offsets = unsafe { array.buffer::<i32>(0) }?;
let child = array.child(0)?;
let values = ffi::try_from(child)?.into();

if offset > 0 {
offsets = offsets.slice(offset, length);
validity = validity.map(|x| x.slice(offset, length))
}
Ok(Self::from_data(data_type, offsets, values, validity))
}
}
85 changes: 85 additions & 0 deletions src/array/map/iterator.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
use crate::array::Array;
use crate::bitmap::utils::{zip_validity, ZipValidity};
use crate::trusted_len::TrustedLen;

use super::MapArray;

/// Iterator of values of an [`ListArray`].
#[derive(Clone, Debug)]
pub struct MapValuesIter<'a> {
array: &'a MapArray,
index: usize,
end: usize,
}

impl<'a> MapValuesIter<'a> {
#[inline]
pub fn new(array: &'a MapArray) -> Self {
Self {
array,
index: 0,
end: array.len(),
}
}
}

impl<'a> Iterator for MapValuesIter<'a> {
type Item = Box<dyn Array>;

#[inline]
fn next(&mut self) -> Option<Self::Item> {
if self.index == self.end {
return None;
}
let old = self.index;
self.index += 1;
// Safety:
// self.end is maximized by the length of the array
Some(unsafe { self.array.value_unchecked(old) })
}

#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
(self.end - self.index, Some(self.end - self.index))
}
}

unsafe impl<'a> TrustedLen for MapValuesIter<'a> {}

impl<'a> DoubleEndedIterator for MapValuesIter<'a> {
#[inline]
fn next_back(&mut self) -> Option<Self::Item> {
if self.index == self.end {
None
} else {
self.end -= 1;
// Safety:
// self.end is maximized by the length of the array
Some(unsafe { self.array.value_unchecked(self.end) })
}
}
}

impl<'a> IntoIterator for &'a MapArray {
type Item = Option<Box<dyn Array>>;
type IntoIter = ZipValidity<'a, Box<dyn Array>, MapValuesIter<'a>>;

fn into_iter(self) -> Self::IntoIter {
self.iter()
}
}

impl<'a> MapArray {
/// Returns an iterator of `Option<Box<dyn Array>>`
pub fn iter(&'a self) -> ZipValidity<'a, Box<dyn Array>, MapValuesIter<'a>> {
zip_validity(
MapValuesIter::new(self),
self.validity.as_ref().map(|x| x.iter()),
)
}

/// Returns an iterator of `Box<dyn Array>`
pub fn values_iter(&'a self) -> MapValuesIter<'a> {
MapValuesIter::new(self)
}
}
Loading