Skip to content
151 changes: 120 additions & 31 deletions arrow-array/src/array/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
/// little-endian `u128` representation, converts them to big-endian ordering, and packs them
/// into a single `u128` value suitable for fast, branchless comparisons.
///
/// ### Why include length?
/// # Why include length?
///
/// A pure 96-bit content comparison can’t distinguish between two values whose inline bytes
/// compare equal—either because one is a true prefix of the other or because zero-padding
Expand All @@ -605,29 +605,85 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
/// key("bar\0") = 0x62617200000000000000000000000004
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this diagram needs to be updated as well

/// ⇒ key("bar") < key("bar\0")
/// ```
/// # Inlining and Endianness
///
/// - We start by calling `.to_le_bytes()` on the `raw` `u128`, because Rust’s native in‑memory
/// representation is little‑endian on x86/ARM.
/// - We extract the low 32 bits numerically (`raw as u32`)—this step is endianness‑free.
/// - We copy the 12 bytes of inline data (original order) into `buf[0..12]`.
/// - We serialize `length` as big‑endian into `buf[12..16]`.
/// - Finally, `u128::from_be_bytes(buf)` treats `buf[0]` as the most significant byte
/// and `buf[15]` as the least significant, producing a `u128` whose integer value
/// directly encodes “inline data then length” in big‑endian form.
///
/// This ensures that a simple `u128` comparison is equivalent to the desired
/// lexicographical comparison of the inline bytes followed by length.
#[inline(always)]
pub fn inline_key_fast(raw: u128) -> u128 {
// Convert the raw u128 (little-endian) into bytes for manipulation
// 1. Decompose `raw` into little‑endian bytes:
// - raw_bytes[0..4] = length in LE
// - raw_bytes[4..16] = inline string data
let raw_bytes = raw.to_le_bytes();

// Extract the length (first 4 bytes), convert to big-endian u32, and promote to u128
let len_le = &raw_bytes[0..4];
let len_be = u32::from_le_bytes(len_le.try_into().unwrap()).to_be() as u128;

// Extract the inline string bytes (next 12 bytes), place them into the lower 12 bytes of a 16-byte array,
// padding the upper 4 bytes with zero to form a little-endian u128 value
let mut inline_bytes = [0u8; 16];
inline_bytes[4..16].copy_from_slice(&raw_bytes[4..16]);

// Convert to big-endian to ensure correct lexical ordering
let inline_u128 = u128::from_le_bytes(inline_bytes).to_be();

// Shift right by 32 bits to discard the zero padding (upper 4 bytes),
// so that the inline string occupies the high 96 bits
let inline_part = inline_u128 >> 32;

// Combine the inline string part (high 96 bits) and length (low 32 bits) into the final key
(inline_part << 32) | len_be
// 2. Numerically truncate to get the low 32‑bit length (endianness‑free).
let length = raw as u32;

// 3. Build a 16‑byte buffer in big‑endian order:
// - buf[0..12] = inline string bytes (in original order)
// - buf[12..16] = length.to_be_bytes() (BE)
let mut buf = [0u8; 16];
buf[0..12].copy_from_slice(&raw_bytes[4..16]); // inline data

// Why convert length to big-endian for comparison?
//
// Rust (on most platforms) stores integers in little-endian format,
// meaning the least significant byte is at the lowest memory address.
// For example, an u32 value like 0x22345677 is stored in memory as:
//
// [0x77, 0x56, 0x34, 0x22] // little-endian layout
// ^ ^ ^ ^
// LSB ↑↑↑ MSB
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a very nice comment

//
// This layout is efficient for arithmetic but *not* suitable for
// lexicographic (dictionary-style) comparison of byte arrays.
//
// To compare values by byte order—e.g., for sorted keys or binary trees—
// we must convert them to **big-endian**, where:
//
// - The most significant byte (MSB) comes first (index 0)
// - The least significant byte (LSB) comes last (index N-1)
//
// In big-endian, the same u32 = 0x22345677 would be represented as:
//
// [0x22, 0x34, 0x56, 0x77]
//
// This ordering aligns with natural string/byte sorting, so calling
// `.to_be_bytes()` allows us to construct
// keys where standard numeric comparison (e.g., `<`, `>`) behaves
// like lexicographic byte comparison.
buf[12..16].copy_from_slice(&length.to_be_bytes()); // length in BE

// 4. Deserialize the buffer as a big‑endian u128:
// buf[0] is MSB, buf[15] is LSB.
// Details:
// Note on endianness and layout:
//
// Although `buf[0]` is stored at the lowest memory address,
// calling `u128::from_be_bytes(buf)` interprets it as the **most significant byte (MSB)**,
// and `buf[15]` as the **least significant byte (LSB)**.
//
// This is the core principle of **big-endian decoding**:
// - Byte at index 0 maps to bits 127..120 (highest)
// - Byte at index 1 maps to bits 119..112
// - ...
// - Byte at index 15 maps to bits 7..0 (lowest)
//
// So even though memory layout goes from low to high (left to right),
// big-endian treats the **first byte** as highest in value.
//
// This guarantees that comparing two `u128` keys is equivalent to lexicographically
// comparing the original inline bytes, followed by length.
u128::from_be_bytes(buf)
}
}

Expand Down Expand Up @@ -1164,22 +1220,35 @@ mod tests {
///
/// This also includes a specific test for the “bar” vs. “bar\0” case, demonstrating why
/// the length field is required even when all inline bytes fit in 12 bytes.
///
/// The test includes strings that verify correct byte order (prevent reversal bugs),
/// and length-based tie-breaking in the composite key.
///
/// The test confirms that `inline_key_fast` produces keys which sort consistently
/// with the expected lexicographical order of the raw byte arrays.
#[test]
fn test_inline_key_fast_various_lengths_and_lexical() {
/// Helper to create a raw u128 value representing an inline ByteView
/// - `length`: number of meaningful bytes (≤ 12)
/// - `data`: the actual inline data
/// Helper to create a raw u128 value representing an inline ByteView:
/// - `length`: number of meaningful bytes (must be ≤ 12)
/// - `data`: the actual inline data bytes
///
/// The first 4 bytes encode length in little-endian,
/// the following 12 bytes contain the inline string data (unpadded).
fn make_raw_inline(length: u32, data: &[u8]) -> u128 {
assert!(length as usize <= 12, "Inline length must be ≤ 12");
assert!(data.len() == length as usize, "Data must match length");
assert!(
data.len() == length as usize,
"Data length must match `length`"
);

let mut raw_bytes = [0u8; 16];
raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // little-endian length
raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // length stored little-endian
raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data
u128::from_le_bytes(raw_bytes)
}

// Test inputs: include the specific "bar" vs "bar\0" case, plus length and lexical variations
// Test inputs: various lengths and lexical orders,
// plus special cases for byte order and length tie-breaking
let test_inputs: Vec<&[u8]> = vec![
b"a",
b"aa",
Expand All @@ -1193,23 +1262,42 @@ mod tests {
b"abcdefghi",
b"abcdefghij",
b"abcdefghijk",
b"abcdefghijkl", // 12 bytes, max inline
b"abcdefghijkl",
// Tests for byte-order reversal bug:
// Without the fix, "backend one" would compare as "eno dnekcab",
// causing incorrect sort order relative to "backend two".
b"backend one",
b"backend two",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we also please include strings that have the same exact inline prefix, where the length differs as well as the content?

Something like

  • LongerThan12Bytes
  • LongerThan12Bytez
  • LongerThan12Bytes\0
  • LongerThan12Byt

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, given the endian swapping going on, can we please also include a few strings that are more than 256 bytes long (so the length requires 2 bytes to store)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you @alamb for good point:

            b"than12Byt",
            b"than12Bytes",
            b"than12Bytes\0",
            b"than12Bytez",

I will add the above cases, because the inline_key_fast function is only used to compare inline values that are <= 12 bytes long.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added in latest PR for above testing case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you @alamb, I also found a way to do it: I adjusted the fuzz testing to reproduce the cases, and this PR makes the fuzz testing work well.

// Tests length-tiebreaker logic:
// "bar" (3 bytes) and "bar\0" (4 bytes) have identical inline data,
// so only the length differentiates their ordering.
b"bar",
b"bar\0", // special case to test length tiebreaker
b"bar\0",
// Additional lexical and length tie-breaking cases with same prefix, in correct lex order:
b"than12Byt",
b"than12Bytes",
b"than12Bytes\0",
b"than12Bytesx",
b"than12Bytex",
b"than12Bytez",
// Additional lexical tests
b"xyy",
b"xyz",
b"xza",
];

// Monotonic key order: content then length, and cross-check against GenericBinaryArray comparison
// Create a GenericBinaryArray for cross-comparison of lex order
let array: GenericBinaryArray<i32> =
GenericBinaryArray::from(test_inputs.iter().map(|s| Some(*s)).collect::<Vec<_>>());

for i in 0..array.len() - 1 {
let v1 = array.value(i);
let v2 = array.value(i + 1);
// Ensure lexical ordering matches

// Assert the array's natural lexical ordering is correct
assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}");
// Ensure fast key compare matches

// Assert the keys produced by inline_key_fast reflect the same ordering
let key1 = GenericByteViewArray::<BinaryViewType>::inline_key_fast(make_raw_inline(
v1.len() as u32,
v1,
Expand All @@ -1218,6 +1306,7 @@ mod tests {
v2.len() as u32,
v2,
));

assert!(
key1 < key2,
"Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}",
Expand Down
31 changes: 30 additions & 1 deletion arrow-row/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2800,6 +2800,34 @@ mod tests {
.collect()
}

fn generate_fixed_stringview_column(len: usize) -> StringViewArray {
let edge_cases = vec![
Some("bar".to_string()),
Some("bar\0".to_string()),
Some("LongerThan12Bytes".to_string()),
Some("LongerThan12Bytez".to_string()),
Some("LongerThan12Bytes\0".to_string()),
Some("LongerThan12Byt".to_string()),
Some("backend one".to_string()),
Some("backend two".to_string()),
Some("a".repeat(257)),
Some("a".repeat(300)),
];

// Fill up to `len` by repeating edge cases and trimming
let mut values = Vec::with_capacity(len);
for i in 0..len {
values.push(
edge_cases
.get(i % edge_cases.len())
.cloned()
.unwrap_or(None),
);
}

StringViewArray::from(values)
}

fn generate_dictionary<K>(
values: ArrayRef,
len: usize,
Expand Down Expand Up @@ -2880,7 +2908,7 @@ mod tests {

fn generate_column(len: usize) -> ArrayRef {
let mut rng = rng();
match rng.random_range(0..16) {
match rng.random_range(0..17) {
0 => Arc::new(generate_primitive_array::<Int32Type>(len, 0.8)),
1 => Arc::new(generate_primitive_array::<UInt32Type>(len, 0.8)),
2 => Arc::new(generate_primitive_array::<Int64Type>(len, 0.8)),
Expand Down Expand Up @@ -2916,6 +2944,7 @@ mod tests {
})),
14 => Arc::new(generate_string_view(len, 0.8)),
15 => Arc::new(generate_byte_view(len, 0.8)),
16 => Arc::new(generate_fixed_stringview_column(len)),
_ => unreachable!(),
}
}
Expand Down
Loading