Skip to content

Commit

Permalink
Add ByteArray constructors (apache#3879)
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold committed Apr 13, 2023
1 parent f0a5e43 commit 7728f2a
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 27 deletions.
65 changes: 63 additions & 2 deletions arrow-array/src/array/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ use crate::iterator::ArrayIter;
use crate::types::bytes::ByteArrayNativeType;
use crate::types::ByteArrayType;
use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait};
use arrow_buffer::{ArrowNativeType, Buffer};
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
use arrow_buffer::{NullBuffer, OffsetBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::DataType;
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::sync::Arc;

Expand Down Expand Up @@ -60,6 +60,67 @@ impl<T: ByteArrayType> GenericByteArray<T> {
/// Data type of the array.
pub const DATA_TYPE: DataType = T::DATA_TYPE;

/// Builds a [`GenericByteArray`] out of its offsets, values and optional null buffer
///
/// This is the panicking counterpart of [`GenericByteArray::try_new`].
///
/// # Panics
///
/// Panics whenever [`GenericByteArray::try_new`] reports an error for the given parts
pub fn new(
    offsets: OffsetBuffer<T::Offset>,
    values: Buffer,
    nulls: Option<NullBuffer>,
) -> Self {
    let outcome = Self::try_new(offsets, values, nulls);
    outcome.unwrap()
}

/// Fallible constructor assembling a [`GenericByteArray`] from its constituent buffers
///
/// # Errors
///
/// * [`ByteArrayType::validate`] rejects the combination of `offsets` and `values`
/// * `nulls` is provided and `nulls.len() != offsets.len() - 1`
pub fn try_new(
    offsets: OffsetBuffer<T::Offset>,
    values: Buffer,
    nulls: Option<NullBuffer>,
) -> Result<Self, ArrowError> {
    // The null buffer, when present, must have one entry fewer than there are offsets
    let len = offsets.len() - 1;

    // Type-specific validation runs before the null buffer length is inspected
    T::validate(&offsets, &values)?;

    let mask_len = nulls.as_ref().map(|mask| mask.len());
    match mask_len {
        Some(actual) if actual != len => Err(ArrowError::InvalidArgumentError(format!(
            "Incorrect number of nulls for {}{}Array, expected {len} got {}",
            T::Offset::PREFIX,
            T::PREFIX,
            actual,
        ))),
        _ => Ok(Self {
            data_type: T::DATA_TYPE,
            value_offsets: offsets,
            value_data: values,
            nulls,
        }),
    }
}

/// Create a new [`GenericByteArray`] of length `len` where all values are null
pub fn new_null(len: usize) -> Self {
Self {
data_type: T::DATA_TYPE,
value_offsets: OffsetBuffer::new_zeroed(len),
value_data: MutableBuffer::new(0).into(),
nulls: Some(NullBuffer::new_null(len)),
}
}

/// Deconstruct this array into its constituent parts
pub fn into_parts(self) -> (OffsetBuffer<T::Offset>, Buffer, Option<NullBuffer>) {
(self.value_offsets, self.value_data, self.nulls)
}

/// Returns the length for value at index `i`.
/// # Panics
/// Panics if index `i` is out of bounds.
Expand Down
28 changes: 4 additions & 24 deletions arrow-array/src/array/string_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@
// under the License.

use crate::types::GenericStringType;
use crate::{
Array, GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait,
};
use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait};
use arrow_buffer::{bit_util, MutableBuffer};
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
Expand Down Expand Up @@ -105,27 +103,8 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
pub fn try_from_binary(
    v: GenericBinaryArray<OffsetSize>,
) -> Result<Self, ArrowError> {
    // Take ownership of the binary array's buffers and hand them to `try_new`,
    // which runs the validation for this array type and returns an error on
    // failure. This replaces the earlier hand-rolled UTF-8 check followed by
    // an unchecked build.
    let (offsets, values, nulls) = v.into_parts();
    Self::try_new(offsets, values, nulls)
}
}

Expand Down Expand Up @@ -261,6 +240,7 @@ mod tests {
use super::*;
use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder};
use crate::types::UInt8Type;
use crate::Array;
use arrow_buffer::Buffer;
use arrow_schema::Field;
use std::sync::Arc;
Expand Down
46 changes: 45 additions & 1 deletion arrow-array/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
use crate::delta::shift_months;
use crate::{ArrowNativeTypeOp, OffsetSizeTrait};
use arrow_buffer::i256;
use arrow_buffer::{i256, Buffer, OffsetBuffer};
use arrow_data::decimal::{validate_decimal256_precision, validate_decimal_precision};
use arrow_schema::{
ArrowError, DataType, IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION,
Expand Down Expand Up @@ -882,10 +882,18 @@ pub trait ByteArrayType: 'static + Send + Sync + bytes::ByteArrayTypeSealed {
/// Utf8Array will have native type has &str
/// BinaryArray will have type as [u8]
type Native: bytes::ByteArrayNativeType + AsRef<Self::Native> + AsRef<[u8]> + ?Sized;

/// "Binary" or "String", for use in error messages
const PREFIX: &'static str;

/// Datatype of array elements
const DATA_TYPE: DataType;

/// Verifies that every pair of `offsets` denotes a valid slice of `values`
///
/// Returns an [`ArrowError`] when the check fails; what counts as "valid"
/// is decided by each implementing type.
fn validate(
offsets: &OffsetBuffer<Self::Offset>,
values: &Buffer,
) -> Result<(), ArrowError>;
}

/// [`ByteArrayType`] for string arrays
Expand All @@ -903,6 +911,27 @@ impl<O: OffsetSizeTrait> ByteArrayType for GenericStringType<O> {
} else {
DataType::Utf8
};

/// Checks that `values` is valid UTF-8 and that every offset falls on a
/// character boundary within it
///
/// # Errors
///
/// Returns [`ArrowError::InvalidArgumentError`] if `values` is not valid UTF-8,
/// if an offset lies beyond the end of `values`, or if an offset splits a
/// UTF-8 codepoint
fn validate(
    offsets: &OffsetBuffer<Self::Offset>,
    values: &Buffer,
) -> Result<(), ArrowError> {
    // Verify that the slice as a whole is valid UTF-8
    let validated = std::str::from_utf8(values).map_err(|e| {
        ArrowError::InvalidArgumentError(format!("Encountered non UTF-8 data: {e}"))
    })?;

    // Verify each offset is at a valid character boundary in this UTF-8 array
    for offset in offsets.iter() {
        let o = offset.as_usize();
        if !validated.is_char_boundary(o) {
            // `is_char_boundary` is false both for offsets inside a codepoint and
            // for offsets past the end of the string; report the two separately
            if o < validated.len() {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "Split UTF-8 codepoint at offset {o}"
                )));
            }
            return Err(ArrowError::InvalidArgumentError(format!(
                "Offset of {o} exceeds length of values {}",
                validated.len()
            )));
        }
    }
    Ok(())
}
}

/// An arrow utf8 array with i32 offsets
Expand All @@ -925,6 +954,21 @@ impl<O: OffsetSizeTrait> ByteArrayType for GenericBinaryType<O> {
} else {
DataType::Binary
};

/// Checks that the final offset does not point past the end of `values`
fn validate(
    offsets: &OffsetBuffer<Self::Offset>,
    values: &Buffer,
) -> Result<(), ArrowError> {
    // Only the last offset is inspected: offsets are guaranteed to be
    // monotonically increasing and non-empty
    let max_offset = offsets.last().unwrap().as_usize();

    if max_offset <= values.len() {
        return Ok(());
    }

    let message = format!(
        "Maximum offset of {max_offset} is larger than values of length {}",
        values.len()
    );
    Err(ArrowError::InvalidArgumentError(message))
}
}

/// An arrow binary array with i32 offsets
Expand Down

0 comments on commit 7728f2a

Please sign in to comment.