Skip to content

Commit

Permalink
Merge branch 'master' into run-end-filter-safety
Browse files Browse the repository at this point in the history
  • Loading branch information
delamarch3 authored Nov 5, 2024
2 parents 20e6eeb + 350ea26 commit a0eb724
Show file tree
Hide file tree
Showing 6 changed files with 115 additions and 13 deletions.
2 changes: 1 addition & 1 deletion arrow-array/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ chrono = { workspace = true }
chrono-tz = { version = "0.10", optional = true }
num = { version = "0.4.1", default-features = false, features = ["std"] }
half = { version = "2.1", default-features = false, features = ["num-traits"] }
hashbrown = { version = "0.14.2", default-features = false }
hashbrown = { version = "0.15.1", default-features = false }

[features]
ffi = ["arrow-schema/ffi", "arrow-data/ffi"]
Expand Down
79 changes: 70 additions & 9 deletions arrow-array/src/array/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,18 @@ use std::sync::Arc;

use super::ByteArrayType;

/// [Variable-size Binary View Layout]: An array of variable length bytes view arrays.
/// [Variable-size Binary View Layout]: An array of variable length bytes views.
///
/// This array type is used to store variable length byte data (e.g. Strings, Binary)
/// and has efficient operations such as `take`, `filter`, and comparison.
///
/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
///
/// This is different from [`GenericByteArray`] as it stores both an offset and
/// length meaning that take / filter operations can be implemented without
/// copying the underlying data. In addition, it stores an inlined prefix which
/// can be used to speed up comparisons.
/// This is different from [`GenericByteArray`], which also stores variable
/// length byte data, as it represents strings with an offset and length. `take`
/// and `filter` like operations are implemented by manipulating the "views"
/// (`u128`) without modifying the bytes. Each view also stores an inlined
/// prefix which speeds up comparisons.
///
/// # See Also
///
Expand All @@ -50,11 +54,18 @@ use super::ByteArrayType;
///
/// [`ByteView`]: arrow_data::ByteView
///
/// # Notes
/// # Use the [`eq`] kernel to compare the logical content.
///
/// Comparing two `GenericByteViewArray` using PartialEq compares by structure
/// (the `u128`s) and contents of the buffers, not by logical content. As there
/// are many different buffer layouts to represent the same data (e.g. different
/// offsets, different buffer sizes, etc) two arrays with the same data may not
/// compare equal.
///
/// To compare the logical content of two `GenericByteViewArray`s, use the [`eq`]
/// kernel.
///
/// Comparing two `GenericByteViewArray` using PartialEq compares by structure,
/// not by value. as there are many different buffer layouts to represent the
/// same data (e.g. different offsets, different buffer sizes, etc).
/// [`eq`]: https://docs.rs/arrow/latest/arrow/compute/kernels/cmp/fn.eq.html
///
/// # Layout: "views" and buffers
///
Expand Down Expand Up @@ -86,6 +97,52 @@ use super::ByteArrayType;
/// view and the entire string is stored in one of the buffers. See [`ByteView`]
/// to access the fields of these views.
///
/// As with other arrays, the optimized kernels in [`arrow_compute`] are likely
/// the easiest and fastest way to work with this data. However, it is possible
/// to access the views and buffers directly for more control.
///
/// For example
///
/// ```rust
/// # use arrow_array::StringViewArray;
/// # use arrow_array::Array;
/// use arrow_data::ByteView;
/// let array = StringViewArray::from(vec![
/// "hello",
/// "this string is longer than 12 bytes",
/// "this string is also longer than 12 bytes"
/// ]);
///
/// // ** Examine the first view (short string) **
/// assert!(array.is_valid(0)); // Check for nulls
/// let short_view: u128 = array.views()[0]; // "hello"
/// // get length of the string
/// let len = short_view as u32;
/// assert_eq!(len, 5); // strings of 12 bytes or fewer are stored inline in the view
/// // SAFETY: `short_view` is a valid view taken from `array.views()`
/// let value = unsafe {
/// StringViewArray::inline_value(&short_view, len as usize)
/// };
/// assert_eq!(value, b"hello");
///
/// // ** Examine the third view (long string) **
/// assert!(array.is_valid(2)); // Check for nulls
/// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes"
/// let len = long_view as u32;
/// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer
/// let view = ByteView::from(long_view); // use ByteView to access the fields
/// assert_eq!(view.length, 40);
/// assert_eq!(view.buffer_index, 0);
/// assert_eq!(view.offset, 35); // data starts after the first long string
/// // Views for long strings store a 4 byte prefix
/// let prefix = view.prefix.to_le_bytes();
/// assert_eq!(&prefix, b"this");
/// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly)
/// assert_eq!(value, "this string is also longer than 12 bytes");
/// ```
///
/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
///
/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
/// than they must point into a valid buffer. However, they can be out of order,
/// non continuous and overlapping.
Expand Down Expand Up @@ -694,6 +751,8 @@ where

/// A [`GenericByteViewArray`] of `[u8]`
///
/// See [`GenericByteViewArray`] for format and layout details.
///
/// # Example
/// ```
/// use arrow_array::BinaryViewArray;
Expand Down Expand Up @@ -733,6 +792,8 @@ impl From<Vec<Option<&[u8]>>> for BinaryViewArray {

/// A [`GenericByteViewArray`] that stores utf8 data
///
/// See [`GenericByteViewArray`] for format and layout details.
///
/// # Example
/// ```
/// use arrow_array::StringViewArray;
Expand Down
38 changes: 38 additions & 0 deletions arrow-string/src/length.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,15 @@ pub fn bit_length(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
let list = array.as_string::<i64>();
Ok(bit_length_impl::<Int64Type>(list.offsets(), list.nulls()))
}
DataType::Utf8View => {
let list = array.as_string_view();
let values = list
.views()
.iter()
.map(|view| (*view as i32).wrapping_mul(8))
.collect();
Ok(Arc::new(Int32Array::new(values, array.nulls().cloned())))
}
DataType::Binary => {
let list = array.as_binary::<i32>();
Ok(bit_length_impl::<Int32Type>(list.offsets(), list.nulls()))
Expand Down Expand Up @@ -462,6 +471,35 @@ mod tests {
})
}

/// Runs every shared `bit_length_cases()` fixture through `bit_length` using a
/// `StringViewArray` as input, checking the output length and each value.
#[test]
fn bit_length_test_utf8view() {
    for (input, len, expected) in bit_length_cases() {
        let view_array = StringViewArray::from(input);
        let bits = bit_length(&view_array).unwrap();
        assert_eq!(len, bits.len());

        // The kernel returns a type-erased array; the assertions below need `Int32Array`.
        let bits = bits.as_any().downcast_ref::<Int32Array>().unwrap();
        for (idx, want) in expected.iter().enumerate() {
            assert_eq!(*want, bits.value(idx));
        }
    }
}

/// Runs every shared `bit_length_null_cases()` fixture through `bit_length`
/// using a `StringViewArray` as input, and compares the whole result (values
/// and validity) against the expected `Int32Array`.
///
/// The input must be a `StringViewArray` (not a `StringArray`) so that the
/// view-array code path is the one under test.
#[test]
fn bit_length_null_utf8view() {
    bit_length_null_cases()
        .into_iter()
        .for_each(|(input, len, expected)| {
            let array = StringViewArray::from(input);
            let result = bit_length(&array).unwrap();
            assert_eq!(len, result.len());
            let result = result.as_any().downcast_ref::<Int32Array>().unwrap();

            let expected: Int32Array = expected.into();
            assert_eq!(&expected, result);
        })
}
#[test]
fn bit_length_binary() {
let value: Vec<&[u8]> = vec![b"one", &[0xff, 0xf8], b"three"];
Expand Down
2 changes: 1 addition & 1 deletion object_store/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ walkdir = "2"
# Cloud storage support
base64 = { version = "0.22", default-features = false, features = ["std"], optional = true }
hyper = { version = "1.2", default-features = false, optional = true }
quick-xml = { version = "0.36.0", features = ["serialize", "overlapped-lists"], optional = true }
quick-xml = { version = "0.37.0", features = ["serialize", "overlapped-lists"], optional = true }
serde = { version = "1.0", default-features = false, features = ["derive"], optional = true }
serde_json = { version = "1.0", default-features = false, optional = true }
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true }
Expand Down
5 changes: 4 additions & 1 deletion object_store/src/gcp/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ enum Error {
#[snafu(display("Error getting put response body: {}", source))]
PutResponseBody { source: reqwest::Error },

#[snafu(display("Got invalid put request: {}", source))]
InvalidPutRequest { source: quick_xml::se::SeError },

#[snafu(display("Got invalid put response: {}", source))]
InvalidPutResponse { source: quick_xml::de::DeError },

Expand Down Expand Up @@ -495,7 +498,7 @@ impl GoogleCloudStorageClient {
let credential = self.get_credential().await?;

let data = quick_xml::se::to_string(&upload_info)
.context(InvalidPutResponseSnafu)?
.context(InvalidPutRequestSnafu)?
// We cannot disable the escaping that transforms `"` to "&quot;" :(
// https://github.com/tafia/quick-xml/issues/362
// https://github.com/tafia/quick-xml/issues/350
Expand Down
2 changes: 1 addition & 1 deletion parquet/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ serde_json = { version = "1.0", default-features = false, features = ["std"], op
seq-macro = { version = "0.3", default-features = false }
futures = { version = "0.3", default-features = false, features = ["std"], optional = true }
tokio = { version = "1.0", optional = true, default-features = false, features = ["macros", "rt", "io-util"] }
hashbrown = { version = "0.14", default-features = false }
hashbrown = { version = "0.15", default-features = false }
twox-hash = { version = "1.6", default-features = false }
paste = { version = "1.0" }
half = { version = "2.1", default-features = false, features = ["num-traits"] }
Expand Down

0 comments on commit a0eb724

Please sign in to comment.