From 5968c4fa3f2b1992a8f95f2279705488db4e4f48 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Mon, 23 Jun 2025 21:26:29 +0800 Subject: [PATCH 01/18] Perf: Change use of inline_value to inline it to a u128 --- arrow-ord/src/cmp.rs | 88 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 72 insertions(+), 16 deletions(-) diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index 46cab1bb8e4c..bcea7e4f74a6 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -589,17 +589,45 @@ impl<'a, T: ByteViewType> ArrayOrd for &'a GenericByteViewArray { #[inline(always)] fn is_lt(l: Self::Item, r: Self::Item) -> bool { + // If both arrays use only the inline buffer if l.0.data_buffers().is_empty() && r.0.data_buffers().is_empty() { - let l_view = unsafe { l.0.views().get_unchecked(l.1) }; - let r_view = unsafe { r.0.views().get_unchecked(r.1) }; - let l_len = *l_view as u32 as usize; - let r_len = *r_view as u32 as usize; - let l_bytes = unsafe { GenericByteViewArray::::inline_value(l_view, l_len) }; - let r_bytes = unsafe { GenericByteViewArray::::inline_value(r_view, r_len) }; - return l_bytes.cmp(r_bytes).is_lt(); + // Directly load the 16-byte view as an u128 (little-endian) + let l_bits: u128 = unsafe { *l.0.views().get_unchecked(l.1) }; + let r_bits: u128 = unsafe { *r.0.views().get_unchecked(r.1) }; + + // The lower 32 bits encode the length (little-endian), + // the upper 96 bits hold the actual data + let l_len = (l_bits as u32) as usize; + let r_len = (r_bits as u32) as usize; + + // Mask to keep only the upper 96 bits (data), zeroing out the length + // 0xFFFF_FFFF_0000_0000_..._0000 + const DATA_MASK: u128 = !0u128 << 32; + + // Remove the length bits, leaving only the data + let l_data = (l_bits & DATA_MASK) >> 32; + let r_data = (r_bits & DATA_MASK) >> 32; + + // The data is stored in little-endian order. 
To compare lexicographically, + // convert to big-endian and use a simple < comparison: + let l_be = u128::from_be(l_data.to_le()); + let r_be = u128::from_be(r_data.to_le()); + + // Compare only the first min_len bytes + let min_len = l_len.min(r_len); + // We have all 12 bytes in the high bits, but only want the top min_len + let shift = (12 - min_len) * 8; + let l_partial = l_be >> shift; + let r_partial = r_be >> shift; + if l_partial != r_partial { + return l_partial < r_partial; + } + + // If the prefixes are equal, the shorter one is considered smaller + return l_len < r_len; } - // # Safety - // The index is within bounds as it is checked in value() + + // Fallback to the generic, unchecked comparison for non-inline cases unsafe { GenericByteViewArray::compare_unchecked(l.0, l.1, r.0, r.1).is_lt() } } @@ -643,13 +671,41 @@ pub fn compare_byte_view( assert!(left_idx < left.len()); assert!(right_idx < right.len()); if left.data_buffers().is_empty() && right.data_buffers().is_empty() { - let l_view = unsafe { left.views().get_unchecked(left_idx) }; - let r_view = unsafe { right.views().get_unchecked(right_idx) }; - let l_len = *l_view as u32 as usize; - let r_len = *r_view as u32 as usize; - let l_bytes = unsafe { GenericByteViewArray::::inline_value(l_view, l_len) }; - let r_bytes = unsafe { GenericByteViewArray::::inline_value(r_view, r_len) }; - return l_bytes.cmp(r_bytes); + // Directly load the 16-byte view as an u128 (little-endian) + let l_bits: u128 = unsafe { *left.views().get_unchecked(left_idx) }; + let r_bits: u128 = unsafe { *left.views().get_unchecked(right_idx) }; + + // The lower 32 bits encode the length (little-endian), + // the upper 96 bits hold the actual data + let l_len = (l_bits as u32) as usize; + let r_len = (r_bits as u32) as usize; + + // Mask to keep only the upper 96 bits (data), zeroing out the length + // 0xFFFF_FFFF_0000_0000_..._0000 + const DATA_MASK: u128 = !0u128 << 32; + + // Remove the length bits, leaving only the data + 
let l_data = (l_bits & DATA_MASK) >> 32; + let r_data = (r_bits & DATA_MASK) >> 32; + + // The data is stored in little-endian order. To compare lexicographically, + // convert to big-endian and use a simple < comparison: + let l_be = u128::from_be(l_data.to_le()); + let r_be = u128::from_be(r_data.to_le()); + + // Compare only the first min_len bytes + let min_len = l_len.min(r_len); + // We have all 12 bytes in the high bits, but only want the top min_len + let shift = (12 - min_len) * 8; + let l_partial = l_be >> shift; + let r_partial = r_be >> shift; + if l_partial < r_partial { + return std::cmp::Ordering::Less; + } else if l_partial > r_partial { + return std::cmp::Ordering::Greater; + } + // Prefix equal: shorter length is less + return l_len.cmp(&r_len); } unsafe { GenericByteViewArray::compare_unchecked(left, left_idx, right, right_idx) } } From 061243c401e0ec0863b2218fc5ac0c246f7dc87a Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Mon, 23 Jun 2025 22:34:38 +0800 Subject: [PATCH 02/18] improve lt scalar --- arrow-array/src/array/byte_view_array.rs | 38 ++++++++++++++++++++++-- arrow-ord/src/cmp.rs | 2 +- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 713e275d186c..0e3361e58969 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -545,9 +545,41 @@ impl GenericByteViewArray { let r_len = *r_view as u32; if l_len <= 12 && r_len <= 12 { - let l_data = unsafe { GenericByteViewArray::::inline_value(l_view, l_len as usize) }; - let r_data = unsafe { GenericByteViewArray::::inline_value(r_view, r_len as usize) }; - return l_data.cmp(r_data); + // Directly load the 16-byte view as an u128 (little-endian) + let l_bits: u128 = *l_view; + let r_bits: u128 = *r_view; + + // The lower 32 bits encode the length (little-endian), + // the upper 96 bits hold the actual data + let l_len = (l_bits as 
u32) as usize; + let r_len = (r_bits as u32) as usize; + + // Mask to keep only the upper 96 bits (data), zeroing out the length + // 0xFFFF_FFFF_0000_0000_..._0000 + const DATA_MASK: u128 = !0u128 << 32; + + // Remove the length bits, leaving only the data + let l_data = (l_bits & DATA_MASK) >> 32; + let r_data = (r_bits & DATA_MASK) >> 32; + + // The data is stored in little-endian order. To compare lexicographically, + // convert to big-endian and use a simple < comparison: + let l_be = u128::from_be(l_data.to_le()); + let r_be = u128::from_be(r_data.to_le()); + + // Compare only the first min_len bytes + let min_len = l_len.min(r_len); + // We have all 12 bytes in the high bits, but only want the top min_len + let shift = (12 - min_len) * 8; + let l_partial = l_be >> shift; + let r_partial = r_be >> shift; + if l_partial < r_partial { + return std::cmp::Ordering::Less; + } else if l_partial > r_partial { + return std::cmp::Ordering::Greater; + } + // Prefix equal: shorter length is less + return l_len.cmp(&r_len); } // one of the string is larger than 12 bytes, diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index bcea7e4f74a6..550d190364ec 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -571,7 +571,7 @@ impl<'a, T: ByteViewType> ArrayOrd for &'a GenericByteViewArray { let r_view = unsafe { r.0.views().get_unchecked(r.1) }; if l.0.data_buffers().is_empty() && r.0.data_buffers().is_empty() { // For eq case, we can directly compare the inlined bytes - return l_view.cmp(r_view).is_eq(); + return l_view.eq(r_view); } let l_len = *l_view as u32; From 59ede5bb1219028a71cb6be8860b2a5da4018d16 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Mon, 23 Jun 2025 22:41:13 +0800 Subject: [PATCH 03/18] comments --- arrow-ord/src/cmp.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index 550d190364ec..19e5b6f63211 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -628,6 
+628,8 @@ impl<'a, T: ByteViewType> ArrayOrd for &'a GenericByteViewArray { } // Fallback to the generic, unchecked comparison for non-inline cases + // # Safety + // The index is within bounds as it is checked in value() unsafe { GenericByteViewArray::compare_unchecked(l.0, l.1, r.0, r.1).is_lt() } } From b805dfe8d6533d4df989436e3af9d813f6190abd Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Wed, 25 Jun 2025 18:11:18 +0800 Subject: [PATCH 04/18] continue improve performance --- arrow-array/src/array/byte_view_array.rs | 32 ++++++++-------- arrow-ord/src/cmp.rs | 48 +++++++++++------------- 2 files changed, 36 insertions(+), 44 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 0e3361e58969..9e9d23890984 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -27,6 +27,7 @@ use arrow_schema::{ArrowError, DataType}; use core::str; use num::ToPrimitive; use std::any::Any; +use std::cmp::Ordering; use std::fmt::Debug; use std::marker::PhantomData; use std::sync::Arc; @@ -537,7 +538,7 @@ impl GenericByteViewArray { left_idx: usize, right: &GenericByteViewArray, right_idx: usize, - ) -> std::cmp::Ordering { + ) -> Ordering { let l_view = left.views().get_unchecked(left_idx); let l_len = *l_view as u32; @@ -546,26 +547,22 @@ impl GenericByteViewArray { if l_len <= 12 && r_len <= 12 { // Directly load the 16-byte view as an u128 (little-endian) - let l_bits: u128 = *l_view; - let r_bits: u128 = *r_view; + let l_bits: u128 = unsafe { *left.views().get_unchecked(left_idx) }; + let r_bits: u128 = unsafe { *right.views().get_unchecked(right_idx) }; // The lower 32 bits encode the length (little-endian), // the upper 96 bits hold the actual data - let l_len = (l_bits as u32) as usize; - let r_len = (r_bits as u32) as usize; - - // Mask to keep only the upper 96 bits (data), zeroing out the length - // 0xFFFF_FFFF_0000_0000_..._0000 - const 
DATA_MASK: u128 = !0u128 << 32; + let l_len = l_bits as u32; + let r_len = r_bits as u32; // Remove the length bits, leaving only the data - let l_data = (l_bits & DATA_MASK) >> 32; - let r_data = (r_bits & DATA_MASK) >> 32; + let l_data = (l_bits >> 32) as u64; + let r_data = (r_bits >> 32) as u64; // The data is stored in little-endian order. To compare lexicographically, - // convert to big-endian and use a simple < comparison: - let l_be = u128::from_be(l_data.to_le()); - let r_be = u128::from_be(r_data.to_le()); + // convert to big-endian: + let l_be = l_data.swap_bytes(); + let r_be = r_data.swap_bytes(); // Compare only the first min_len bytes let min_len = l_len.min(r_len); @@ -574,11 +571,12 @@ impl GenericByteViewArray { let l_partial = l_be >> shift; let r_partial = r_be >> shift; if l_partial < r_partial { - return std::cmp::Ordering::Less; + return Ordering::Less; } else if l_partial > r_partial { - return std::cmp::Ordering::Greater; + return Ordering::Greater; } - // Prefix equal: shorter length is less + + // If the prefixes are equal, the shorter one is considered smaller return l_len.cmp(&r_len); } diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index 9ed9d58ea4e2..3fcf1401a178 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -33,6 +33,7 @@ use arrow_buffer::bit_util::ceil; use arrow_buffer::{BooleanBuffer, MutableBuffer, NullBuffer}; use arrow_schema::ArrowError; use arrow_select::take::take; +use std::cmp::Ordering; use std::ops::Not; #[derive(Debug, Copy, Clone)] @@ -600,21 +601,17 @@ impl<'a, T: ByteViewType> ArrayOrd for &'a GenericByteViewArray { // The lower 32 bits encode the length (little-endian), // the upper 96 bits hold the actual data - let l_len = (l_bits as u32) as usize; - let r_len = (r_bits as u32) as usize; - - // Mask to keep only the upper 96 bits (data), zeroing out the length - // 0xFFFF_FFFF_0000_0000_..._0000 - const DATA_MASK: u128 = !0u128 << 32; + let l_len = l_bits as u32; + let r_len = r_bits 
as u32; // Remove the length bits, leaving only the data - let l_data = (l_bits & DATA_MASK) >> 32; - let r_data = (r_bits & DATA_MASK) >> 32; + let l_data = (l_bits >> 32) as u64; + let r_data = (r_bits >> 32) as u64; // The data is stored in little-endian order. To compare lexicographically, - // convert to big-endian and use a simple < comparison: - let l_be = u128::from_be(l_data.to_le()); - let r_be = u128::from_be(r_data.to_le()); + // convert to big-endian: + let l_be = l_data.swap_bytes(); + let r_be = r_data.swap_bytes(); // Compare only the first min_len bytes let min_len = l_len.min(r_len); @@ -672,31 +669,27 @@ pub fn compare_byte_view( left_idx: usize, right: &GenericByteViewArray, right_idx: usize, -) -> std::cmp::Ordering { +) -> Ordering { assert!(left_idx < left.len()); assert!(right_idx < right.len()); if left.data_buffers().is_empty() && right.data_buffers().is_empty() { // Directly load the 16-byte view as an u128 (little-endian) let l_bits: u128 = unsafe { *left.views().get_unchecked(left_idx) }; - let r_bits: u128 = unsafe { *left.views().get_unchecked(right_idx) }; + let r_bits: u128 = unsafe { *right.views().get_unchecked(right_idx) }; // The lower 32 bits encode the length (little-endian), // the upper 96 bits hold the actual data - let l_len = (l_bits as u32) as usize; - let r_len = (r_bits as u32) as usize; - - // Mask to keep only the upper 96 bits (data), zeroing out the length - // 0xFFFF_FFFF_0000_0000_..._0000 - const DATA_MASK: u128 = !0u128 << 32; + let l_len = l_bits as u32; + let r_len = r_bits as u32; // Remove the length bits, leaving only the data - let l_data = (l_bits & DATA_MASK) >> 32; - let r_data = (r_bits & DATA_MASK) >> 32; + let l_data = (l_bits >> 32) as u64; + let r_data = (r_bits >> 32) as u64; // The data is stored in little-endian order. 
To compare lexicographically, - // convert to big-endian and use a simple < comparison: - let l_be = u128::from_be(l_data.to_le()); - let r_be = u128::from_be(r_data.to_le()); + // convert to big-endian: + let l_be = l_data.swap_bytes(); + let r_be = r_data.swap_bytes(); // Compare only the first min_len bytes let min_len = l_len.min(r_len); @@ -705,11 +698,12 @@ pub fn compare_byte_view( let l_partial = l_be >> shift; let r_partial = r_be >> shift; if l_partial < r_partial { - return std::cmp::Ordering::Less; + return Ordering::Less; } else if l_partial > r_partial { - return std::cmp::Ordering::Greater; + return Ordering::Greater; } - // Prefix equal: shorter length is less + + // If the prefixes are equal, the shorter one is considered smaller return l_len.cmp(&r_len); } unsafe { GenericByteViewArray::compare_unchecked(left, left_idx, right, right_idx) } From 59802979bbbb5608d4b1dc9a986bee34f8cf3eb1 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Wed, 25 Jun 2025 18:14:39 +0800 Subject: [PATCH 05/18] fix --- arrow-array/src/array/byte_view_array.rs | 8 ++++---- arrow-ord/src/cmp.rs | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 9e9d23890984..d9df8a2f3b1d 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -552,12 +552,12 @@ impl GenericByteViewArray { // The lower 32 bits encode the length (little-endian), // the upper 96 bits hold the actual data - let l_len = l_bits as u32; - let r_len = r_bits as u32; + let l_len = l_bits as u32 as u64; + let r_len = r_bits as u32 as u64; // Remove the length bits, leaving only the data - let l_data = (l_bits >> 32) as u64; - let r_data = (r_bits >> 32) as u64; + let l_data = l_bits >> 32; + let r_data = r_bits >> 32; // The data is stored in little-endian order. 
To compare lexicographically, // convert to big-endian: diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index 3fcf1401a178..5236e7512f49 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -601,12 +601,12 @@ impl<'a, T: ByteViewType> ArrayOrd for &'a GenericByteViewArray { // The lower 32 bits encode the length (little-endian), // the upper 96 bits hold the actual data - let l_len = l_bits as u32; - let r_len = r_bits as u32; + let l_len = l_bits as u32 as u64; + let r_len = r_bits as u32 as u64; // Remove the length bits, leaving only the data - let l_data = (l_bits >> 32) as u64; - let r_data = (r_bits >> 32) as u64; + let l_data = l_bits >> 32; + let r_data = r_bits >> 32; // The data is stored in little-endian order. To compare lexicographically, // convert to big-endian: @@ -679,12 +679,12 @@ pub fn compare_byte_view( // The lower 32 bits encode the length (little-endian), // the upper 96 bits hold the actual data - let l_len = l_bits as u32; - let r_len = r_bits as u32; + let l_len = l_bits as u32 as u64; + let r_len = r_bits as u32 as u64; // Remove the length bits, leaving only the data - let l_data = (l_bits >> 32) as u64; - let r_data = (r_bits >> 32) as u64; + let l_data = l_bits >> 32; + let r_data = r_bits >> 32; // The data is stored in little-endian order. 
To compare lexicographically, // convert to big-endian: From 6fb2f8437ec294030cd5b734267eff6b0704617a Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Wed, 25 Jun 2025 18:22:27 +0800 Subject: [PATCH 06/18] optimize u64 to u32 --- arrow-array/src/array/byte_view_array.rs | 4 ++-- arrow-ord/src/cmp.rs | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index d9df8a2f3b1d..218429173467 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -552,8 +552,8 @@ impl GenericByteViewArray { // The lower 32 bits encode the length (little-endian), // the upper 96 bits hold the actual data - let l_len = l_bits as u32 as u64; - let r_len = r_bits as u32 as u64; + let l_len = l_bits as u32; + let r_len = r_bits as u32; // Remove the length bits, leaving only the data let l_data = l_bits >> 32; diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index 5236e7512f49..c4575883645c 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -601,8 +601,8 @@ impl<'a, T: ByteViewType> ArrayOrd for &'a GenericByteViewArray { // The lower 32 bits encode the length (little-endian), // the upper 96 bits hold the actual data - let l_len = l_bits as u32 as u64; - let r_len = r_bits as u32 as u64; + let l_len = l_bits as u32; + let r_len = r_bits as u32; // Remove the length bits, leaving only the data let l_data = l_bits >> 32; @@ -679,8 +679,8 @@ pub fn compare_byte_view( // The lower 32 bits encode the length (little-endian), // the upper 96 bits hold the actual data - let l_len = l_bits as u32 as u64; - let r_len = r_bits as u32 as u64; + let l_len = l_bits as u32; + let r_len = r_bits as u32; // Remove the length bits, leaving only the data let l_data = l_bits >> 32; From 3428523ef38d25f0187218a965f9247b23b906c8 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Wed, 25 Jun 2025 18:31:33 
+0800 Subject: [PATCH 07/18] reduce memory access count --- arrow-array/src/array/byte_view_array.rs | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 218429173467..e41a53a23d3d 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -546,18 +546,9 @@ impl GenericByteViewArray { let r_len = *r_view as u32; if l_len <= 12 && r_len <= 12 { - // Directly load the 16-byte view as an u128 (little-endian) - let l_bits: u128 = unsafe { *left.views().get_unchecked(left_idx) }; - let r_bits: u128 = unsafe { *right.views().get_unchecked(right_idx) }; - - // The lower 32 bits encode the length (little-endian), - // the upper 96 bits hold the actual data - let l_len = l_bits as u32; - let r_len = r_bits as u32; - // Remove the length bits, leaving only the data - let l_data = l_bits >> 32; - let r_data = r_bits >> 32; + let l_data = *l_view >> 32; + let r_data = *r_view >> 32; // The data is stored in little-endian order. 
To compare lexicographically, // convert to big-endian: From 1b5a1da59da356c7a7a634cd314c3ed900f3a944 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Wed, 25 Jun 2025 21:32:25 +0800 Subject: [PATCH 08/18] continue improve performance --- arrow-array/src/array/byte_view_array.rs | 58 +++++++++--------- arrow-ord/src/cmp.rs | 76 +++++------------------- 2 files changed, 43 insertions(+), 91 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index e41a53a23d3d..0c2ef7e8cebe 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -540,43 +540,24 @@ impl GenericByteViewArray { right_idx: usize, ) -> Ordering { let l_view = left.views().get_unchecked(left_idx); - let l_len = *l_view as u32; + let l_byte_view = ByteView::from(*l_view); let r_view = right.views().get_unchecked(right_idx); - let r_len = *r_view as u32; + let r_byte_view = ByteView::from(*r_view); - if l_len <= 12 && r_len <= 12 { - // Remove the length bits, leaving only the data - let l_data = *l_view >> 32; - let r_data = *r_view >> 32; - - // The data is stored in little-endian order. 
To compare lexicographically, - // convert to big-endian: - let l_be = l_data.swap_bytes(); - let r_be = r_data.swap_bytes(); - - // Compare only the first min_len bytes - let min_len = l_len.min(r_len); - // We have all 12 bytes in the high bits, but only want the top min_len - let shift = (12 - min_len) * 8; - let l_partial = l_be >> shift; - let r_partial = r_be >> shift; - if l_partial < r_partial { - return Ordering::Less; - } else if l_partial > r_partial { - return Ordering::Greater; - } + let l_len = l_byte_view.length; + let r_len = r_byte_view.length; - // If the prefixes are equal, the shorter one is considered smaller - return l_len.cmp(&r_len); + if l_len <= 12 && r_len <= 12 { + return Self::inline_key_fast(l_byte_view).cmp(&Self::inline_key_fast(r_byte_view)); } // one of the string is larger than 12 bytes, // we then try to compare the inlined data first - let l_inlined_data = unsafe { GenericByteViewArray::::inline_value(l_view, 4) }; - let r_inlined_data = unsafe { GenericByteViewArray::::inline_value(r_view, 4) }; - if r_inlined_data != l_inlined_data { - return l_inlined_data.cmp(r_inlined_data); + let l_inlined_be = l_byte_view.prefix.swap_bytes(); + let r_inlined_be = r_byte_view.prefix.swap_bytes(); + if l_inlined_be != r_inlined_be { + return l_inlined_be.cmp(&r_inlined_be); } // unfortunately, we need to compare the full data @@ -585,6 +566,25 @@ impl GenericByteViewArray { l_full_data.cmp(r_full_data) } + + /// Builds a 128-bit composite key for an inline value: + /// - High 96 bits: the inline data, in big-endian order + /// - Low 32 bits: the length, in big-endian order (so shorter is always numerically smaller) + #[inline(always)] + pub fn inline_key_fast(raw: ByteView) -> u128 { + // Convert each to big-endian (so their bytes go MSB→LSB in correct lexical order) + let a = raw.prefix.to_be() as u128; // data[0..4] + let b = raw.buffer_index.to_be() as u128; // data[4..8] + let c = raw.offset.to_be() as u128; // data[8..12] + let d = 
raw.length.to_be() as u128; // length + + // Pack them into one u128: + // bits 127..96 = a + // bits 95..64 = b + // bits 63..32 = c + // bits 31..0 = d + (a << 96) | (b << 64) | (c << 32) | d + } } impl Debug for GenericByteViewArray { diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index c4575883645c..15fb28a2ef70 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -31,6 +31,7 @@ use arrow_array::{ }; use arrow_buffer::bit_util::ceil; use arrow_buffer::{BooleanBuffer, MutableBuffer, NullBuffer}; +use arrow_data::ByteView; use arrow_schema::ArrowError; use arrow_select::take::take; use std::cmp::Ordering; @@ -595,36 +596,13 @@ impl<'a, T: ByteViewType> ArrayOrd for &'a GenericByteViewArray { fn is_lt(l: Self::Item, r: Self::Item) -> bool { // If both arrays use only the inline buffer if l.0.data_buffers().is_empty() && r.0.data_buffers().is_empty() { - // Directly load the 16-byte view as an u128 (little-endian) - let l_bits: u128 = unsafe { *l.0.views().get_unchecked(l.1) }; - let r_bits: u128 = unsafe { *r.0.views().get_unchecked(r.1) }; - - // The lower 32 bits encode the length (little-endian), - // the upper 96 bits hold the actual data - let l_len = l_bits as u32; - let r_len = r_bits as u32; - - // Remove the length bits, leaving only the data - let l_data = l_bits >> 32; - let r_data = r_bits >> 32; - - // The data is stored in little-endian order. 
To compare lexicographically, - // convert to big-endian: - let l_be = l_data.swap_bytes(); - let r_be = r_data.swap_bytes(); - - // Compare only the first min_len bytes - let min_len = l_len.min(r_len); - // We have all 12 bytes in the high bits, but only want the top min_len - let shift = (12 - min_len) * 8; - let l_partial = l_be >> shift; - let r_partial = r_be >> shift; - if l_partial != r_partial { - return l_partial < r_partial; - } - - // If the prefixes are equal, the shorter one is considered smaller - return l_len < r_len; + let l_view = unsafe { l.0.views().get_unchecked(l.1) }; + let r_view = unsafe { r.0.views().get_unchecked(r.1) }; + let l_byte_view = ByteView::from(*l_view); + let r_byte_view = ByteView::from(*r_view); + return GenericByteViewArray::::inline_key_fast(l_byte_view) + .cmp(&GenericByteViewArray::::inline_key_fast(r_byte_view)) + .is_lt(); } // Fallback to the generic, unchecked comparison for non-inline cases @@ -673,38 +651,12 @@ pub fn compare_byte_view( assert!(left_idx < left.len()); assert!(right_idx < right.len()); if left.data_buffers().is_empty() && right.data_buffers().is_empty() { - // Directly load the 16-byte view as an u128 (little-endian) - let l_bits: u128 = unsafe { *left.views().get_unchecked(left_idx) }; - let r_bits: u128 = unsafe { *right.views().get_unchecked(right_idx) }; - - // The lower 32 bits encode the length (little-endian), - // the upper 96 bits hold the actual data - let l_len = l_bits as u32; - let r_len = r_bits as u32; - - // Remove the length bits, leaving only the data - let l_data = l_bits >> 32; - let r_data = r_bits >> 32; - - // The data is stored in little-endian order. 
To compare lexicographically, - // convert to big-endian: - let l_be = l_data.swap_bytes(); - let r_be = r_data.swap_bytes(); - - // Compare only the first min_len bytes - let min_len = l_len.min(r_len); - // We have all 12 bytes in the high bits, but only want the top min_len - let shift = (12 - min_len) * 8; - let l_partial = l_be >> shift; - let r_partial = r_be >> shift; - if l_partial < r_partial { - return Ordering::Less; - } else if l_partial > r_partial { - return Ordering::Greater; - } - - // If the prefixes are equal, the shorter one is considered smaller - return l_len.cmp(&r_len); + let l_view = unsafe { left.views().get_unchecked(left_idx) }; + let r_view = unsafe { right.views().get_unchecked(right_idx) }; + let l_byte_view = ByteView::from(*l_view); + let r_byte_view = ByteView::from(*r_view); + return GenericByteViewArray::::inline_key_fast(l_byte_view) + .cmp(&GenericByteViewArray::::inline_key_fast(r_byte_view)); } unsafe { GenericByteViewArray::compare_unchecked(left, left_idx, right, right_idx) } } From 8d4da6ab5d1bfd931f6bd02e5ce83a1e2e2ba97d Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Wed, 25 Jun 2025 21:34:22 +0800 Subject: [PATCH 09/18] address comments --- arrow-ord/src/cmp.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index 15fb28a2ef70..1a06ae48e0b5 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -573,7 +573,7 @@ impl<'a, T: ByteViewType> ArrayOrd for &'a GenericByteViewArray { let r_view = unsafe { r.0.views().get_unchecked(r.1) }; if l.0.data_buffers().is_empty() && r.0.data_buffers().is_empty() { // For eq case, we can directly compare the inlined bytes - return l_view.eq(r_view); + return l_view == r_view; } let l_len = *l_view as u32; From 52c2fc33e4f53a1696bea0cec0b8ffd972181cb8 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Wed, 25 Jun 2025 21:35:27 +0800 Subject: [PATCH 10/18] clean code --- 
arrow-ord/src/cmp.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index 1a06ae48e0b5..a5c51fc10544 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -601,8 +601,7 @@ impl<'a, T: ByteViewType> ArrayOrd for &'a GenericByteViewArray { let l_byte_view = ByteView::from(*l_view); let r_byte_view = ByteView::from(*r_view); return GenericByteViewArray::::inline_key_fast(l_byte_view) - .cmp(&GenericByteViewArray::::inline_key_fast(r_byte_view)) - .is_lt(); + < GenericByteViewArray::::inline_key_fast(r_byte_view) } // Fallback to the generic, unchecked comparison for non-inline cases From c8c7038054ed614f0062c514be58807e32f35ee3 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Wed, 25 Jun 2025 22:02:17 +0800 Subject: [PATCH 11/18] optimize --- arrow-array/src/array/byte_view_array.rs | 41 ++++++++++++++++-------- arrow-ord/src/cmp.rs | 13 +++----- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 0c2ef7e8cebe..15b3fc1afa5e 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -549,11 +549,15 @@ impl GenericByteViewArray { let r_len = r_byte_view.length; if l_len <= 12 && r_len <= 12 { - return Self::inline_key_fast(l_byte_view).cmp(&Self::inline_key_fast(r_byte_view)); + return Self::inline_key_fast(*l_view).cmp(&Self::inline_key_fast(*r_view)); } // one of the string is larger than 12 bytes, // we then try to compare the inlined data first + + // Note: In theory, ByteView is only used for views larger than 12 bytes, + // but we can still use it to get the inlined prefix for shorter strings. + // The prefix is always the first 4 bytes of the view, for both short and long strings. 
let l_inlined_be = l_byte_view.prefix.swap_bytes(); let r_inlined_be = r_byte_view.prefix.swap_bytes(); if l_inlined_be != r_inlined_be { @@ -571,19 +575,28 @@ impl GenericByteViewArray { /// - High 96 bits: the inline data, in big-endian order /// - Low 32 bits: the length, in big-endian order (so shorter is always numerically smaller) #[inline(always)] - pub fn inline_key_fast(raw: ByteView) -> u128 { - // Convert each to big-endian (so their bytes go MSB→LSB in correct lexical order) - let a = raw.prefix.to_be() as u128; // data[0..4] - let b = raw.buffer_index.to_be() as u128; // data[4..8] - let c = raw.offset.to_be() as u128; // data[8..12] - let d = raw.length.to_be() as u128; // length - - // Pack them into one u128: - // bits 127..96 = a - // bits 95..64 = b - // bits 63..32 = c - // bits 31..0 = d - (a << 96) | (b << 64) | (c << 32) | d + pub fn inline_key_fast(raw: u128) -> u128 { + // Convert the raw u128 (little-endian) into bytes for manipulation + let raw_bytes = raw.to_le_bytes(); + + // Extract the length (first 4 bytes), convert to big-endian u32, and promote to u128 + let len_le = &raw_bytes[0..4]; + let len_be = u32::from_le_bytes(len_le.try_into().unwrap()).to_be() as u128; + + // Extract the inline string bytes (next 12 bytes), place them into the lower 12 bytes of a 16-byte array, + // padding the upper 4 bytes with zero to form a little-endian u128 value + let mut inline_bytes = [0u8; 16]; + inline_bytes[4..16].copy_from_slice(&raw_bytes[4..16]); + + // Convert to big-endian to ensure correct lexical ordering + let inline_u128 = u128::from_le_bytes(inline_bytes).to_be(); + + // Shift right by 32 bits to discard the zero padding (upper 4 bytes), + // so that the inline string occupies the high 96 bits + let inline_part = inline_u128 >> 32; + + // Combine the inline string part (high 96 bits) and length (low 32 bits) into the final key + (inline_part << 32) | len_be } } diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index 
a5c51fc10544..177bf8e82b6a 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -31,7 +31,6 @@ use arrow_array::{ }; use arrow_buffer::bit_util::ceil; use arrow_buffer::{BooleanBuffer, MutableBuffer, NullBuffer}; -use arrow_data::ByteView; use arrow_schema::ArrowError; use arrow_select::take::take; use std::cmp::Ordering; @@ -598,10 +597,8 @@ impl<'a, T: ByteViewType> ArrayOrd for &'a GenericByteViewArray { if l.0.data_buffers().is_empty() && r.0.data_buffers().is_empty() { let l_view = unsafe { l.0.views().get_unchecked(l.1) }; let r_view = unsafe { r.0.views().get_unchecked(r.1) }; - let l_byte_view = ByteView::from(*l_view); - let r_byte_view = ByteView::from(*r_view); - return GenericByteViewArray::::inline_key_fast(l_byte_view) - < GenericByteViewArray::::inline_key_fast(r_byte_view) + return GenericByteViewArray::::inline_key_fast(*l_view) + < GenericByteViewArray::::inline_key_fast(*r_view) } // Fallback to the generic, unchecked comparison for non-inline cases @@ -652,10 +649,8 @@ pub fn compare_byte_view( if left.data_buffers().is_empty() && right.data_buffers().is_empty() { let l_view = unsafe { left.views().get_unchecked(left_idx) }; let r_view = unsafe { right.views().get_unchecked(right_idx) }; - let l_byte_view = ByteView::from(*l_view); - let r_byte_view = ByteView::from(*r_view); - return GenericByteViewArray::::inline_key_fast(l_byte_view) - .cmp(&GenericByteViewArray::::inline_key_fast(r_byte_view)); + return GenericByteViewArray::::inline_key_fast(*l_view) + .cmp(&GenericByteViewArray::::inline_key_fast(*r_view)); } unsafe { GenericByteViewArray::compare_unchecked(left, left_idx, right, right_idx) } } From 7194e8d22d2f8bc5c686ef3777487ac0fa647c3e Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Wed, 25 Jun 2025 22:07:09 +0800 Subject: [PATCH 12/18] Add more comments --- arrow-array/src/array/byte_view_array.rs | 16 ++++++++++++++-- arrow-ord/src/cmp.rs | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff 
--git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 15b3fc1afa5e..28bc81ca5057 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -572,8 +572,20 @@ impl GenericByteViewArray { } /// Builds a 128-bit composite key for an inline value: - /// - High 96 bits: the inline data, in big-endian order - /// - Low 32 bits: the length, in big-endian order (so shorter is always numerically smaller) + /// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting) + /// - Low 32 bits: the length in big-endian byte order (so shorter strings are always numerically smaller) + /// + /// This function extracts the length and the 12-byte inline string data from the + /// raw little-endian u128 representation, converts them to big-endian ordering, + /// and packs them into a single u128 value suitable for fast comparisons. + /// + /// # Note + /// The input `raw` is assumed to be in little-endian format with the following layout: + /// - bytes 0..4: length (u32) + /// - bytes 4..16: inline string data (padded with zeros if less than 12 bytes) + /// + /// The output u128 key places the inline string data in the upper 96 bits (big-endian) + /// and the length in the lower 32 bits (big-endian). 
#[inline(always)] pub fn inline_key_fast(raw: u128) -> u128 { // Convert the raw u128 (little-endian) into bytes for manipulation diff --git a/arrow-ord/src/cmp.rs b/arrow-ord/src/cmp.rs index 177bf8e82b6a..f9ab80844d1f 100644 --- a/arrow-ord/src/cmp.rs +++ b/arrow-ord/src/cmp.rs @@ -598,7 +598,7 @@ impl<'a, T: ByteViewType> ArrayOrd for &'a GenericByteViewArray { let l_view = unsafe { l.0.views().get_unchecked(l.1) }; let r_view = unsafe { r.0.views().get_unchecked(r.1) }; return GenericByteViewArray::::inline_key_fast(*l_view) - < GenericByteViewArray::::inline_key_fast(*r_view) + < GenericByteViewArray::::inline_key_fast(*r_view); } // Fallback to the generic, unchecked comparison for non-inline cases From 1a858a1d9013f6b495d2d89e036a8ab662d5da90 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Wed, 25 Jun 2025 22:25:41 +0800 Subject: [PATCH 13/18] Add unit test --- arrow-array/src/array/byte_view_array.rs | 53 +++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 28bc81ca5057..646c95c412fc 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -918,9 +918,10 @@ impl From>> for StringViewArray { #[cfg(test)] mod tests { use crate::builder::{BinaryViewBuilder, StringViewBuilder}; - use crate::{Array, BinaryViewArray, StringViewArray}; + use crate::{Array, BinaryViewArray, GenericByteViewArray, StringViewArray}; use arrow_buffer::{Buffer, ScalarBuffer}; use arrow_data::ByteView; + use crate::types::BinaryViewType; #[test] fn try_new_string() { @@ -1134,4 +1135,54 @@ mod tests { assert_eq!(array2, array2.clone()); assert_eq!(array1, array2); } + + #[test] + fn test_inline_key_fast_various_lengths() { + /// Helper to create a raw u128 value representing an inline ByteView + /// - `length`: number of meaningful bytes (<= 12) + /// - `data`: the actual inline data (prefix + 
padding) + fn make_raw_inline(length: u32, data: &[u8]) -> u128 { + assert!(length as usize <= 12, "Inline length must be ≤ 12"); + assert!(data.len() == length as usize, "Data must match length"); + + let mut raw_bytes = [0u8; 16]; + raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // little-endian length + raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data + u128::from_le_bytes(raw_bytes) + } + + // Test multiple inline values of increasing length and content + let test_inputs: Vec<&[u8]> = vec![ + b"a", + b"ab", + b"abc", + b"abcd", + b"abcde", + b"abcdef", + b"abcdefg", + b"abcdefgh", + b"abcdefghi", + b"abcdefghij", + b"abcdefghijk", + b"abcdefghijkl", // 12 bytes, max inline + ]; + + let mut previous_key = None; + + for input in test_inputs { + let raw = make_raw_inline(input.len() as u32, input); + let key = GenericByteViewArray::::inline_key_fast(raw); + + // Validate that keys are monotonically increasing in lexicographic+length order + if let Some(prev) = previous_key { + assert!( + prev < key, + "Key for {:?} was not greater than previous key", + input + ); + } + + previous_key = Some(key); + } + } } From 442402b7f14947e2e123d97141b7eaeda76a00cd Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Wed, 25 Jun 2025 22:26:03 +0800 Subject: [PATCH 14/18] fmt --- arrow-array/src/array/byte_view_array.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 646c95c412fc..905ef537d578 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -918,10 +918,10 @@ impl From>> for StringViewArray { #[cfg(test)] mod tests { use crate::builder::{BinaryViewBuilder, StringViewBuilder}; + use crate::types::BinaryViewType; use crate::{Array, BinaryViewArray, GenericByteViewArray, StringViewArray}; use arrow_buffer::{Buffer, ScalarBuffer}; use arrow_data::ByteView; - use 
crate::types::BinaryViewType; #[test] fn try_new_string() { @@ -1147,7 +1147,7 @@ mod tests { let mut raw_bytes = [0u8; 16]; raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // little-endian length - raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data + raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data u128::from_le_bytes(raw_bytes) } From 9580de1fad98c7fe3eba474aba7e1b45c80a79dc Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Fri, 27 Jun 2025 23:06:24 +0800 Subject: [PATCH 15/18] Add more testing and comments, and address comments --- arrow-array/src/array/byte_view_array.rs | 126 ++++++++++++++++++----- 1 file changed, 100 insertions(+), 26 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 2b791a51beda..398e39356bfb 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -574,20 +574,53 @@ impl GenericByteViewArray { } /// Builds a 128-bit composite key for an inline value: - /// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting) - /// - Low 32 bits: the length in big-endian byte order (so shorter strings are always numerically smaller) /// - /// This function extracts the length and the 12-byte inline string data from the - /// raw little-endian u128 representation, converts them to big-endian ordering, - /// and packs them into a single u128 value suitable for fast comparisons. + /// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting). + /// - Low 32 bits: the length in big-endian byte order, acting as a tiebreaker so shorter strings + /// (or those with fewer meaningful bytes) always numerically sort before longer ones. 
/// - /// # Note - /// The input `raw` is assumed to be in little-endian format with the following layout: - /// - bytes 0..4: length (u32) - /// - bytes 4..16: inline string data (padded with zeros if less than 12 bytes) + /// This function extracts the length and the 12-byte inline string data from the raw + /// little-endian `u128` representation, converts them to big-endian ordering, and packs them + /// into a single `u128` value suitable for fast, branchless comparisons. /// - /// The output u128 key places the inline string data in the upper 96 bits (big-endian) - /// and the length in the lower 32 bits (big-endian). + /// ### Why include length? + /// + /// A pure 96-bit content comparison can’t distinguish between two values whose inline bytes + /// compare equal—either because one is a true prefix of the other or because zero-padding + /// hides extra bytes. By tucking the 32-bit length into the lower bits, a single `u128` compare + /// handles both content and length in one go. + /// + /// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes) + /// + /// | String | Bytes 0–4 (length LE) | Bytes 4–16 (data + padding) | + /// |------------|-----------------------|---------------------------------| + /// | `"bar"` | `03 00 00 00` | `62 61 72` + 9 × `00` | + /// | `"bar\0"`| `04 00 00 00` | `62 61 72 00` + 8 × `00` | + /// + /// Both inline parts become `62 61 72 00…00`, so they tie on content. 
The length field + /// then differentiates: + /// + /// ```text + /// key("bar") = 0x0000000000000000000062617200000003 + /// key("bar\0") = 0x0000000000000000000062617200000004 + /// ⇒ key("bar") < key("bar\0") + /// ``` + /// + /// Background (naive implementation, two-pass): + /// + /// fn compare_strings(a: &RawInline, b: &RawInline) -> Ordering { + /// // 1) Compare up to 12 bytes of content + /// let ord = compare_u96(a.inline_bytes_be(), b.inline_bytes_be()); + /// if ord != Ordering::Equal { + /// return ord; + /// } + /// // 2) Tiebreaker: compare lengths + /// a.len().cmp(&b.len()) + /// } + /// + /// Folding content and length into one `u128` lets you do: + /// + /// key(a).cmp(&key(b)) #[inline(always)] pub fn inline_key_fast(raw: u128) -> u128 { // Convert the raw u128 (little-endian) into bytes for manipulation @@ -921,7 +954,9 @@ impl From>> for StringViewArray { mod tests { use crate::builder::{BinaryViewBuilder, StringViewBuilder}; use crate::types::BinaryViewType; - use crate::{Array, BinaryViewArray, GenericByteViewArray, StringViewArray}; + use crate::{ + Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray, + }; use arrow_buffer::{Buffer, ScalarBuffer}; use arrow_data::ByteView; @@ -1138,11 +1173,18 @@ mod tests { assert_eq!(array1, array2); } + /// Integration tests for `inline_key_fast` covering: + /// + /// 1. Monotonic ordering across increasing lengths and lexical variations. + /// 2. Cross-check against `GenericBinaryArray` comparison to ensure semantic equivalence. + /// + /// This also includes a specific test for the “bar” vs. “bar\0” case, demonstrating why + /// the length field is required even when all inline bytes fit in 12 bytes. 
#[test] - fn test_inline_key_fast_various_lengths() { + fn test_inline_key_fast_various_lengths_and_lexical() { /// Helper to create a raw u128 value representing an inline ByteView - /// - `length`: number of meaningful bytes (<= 12) - /// - `data`: the actual inline data (prefix + padding) + /// - `length`: number of meaningful bytes (≤ 12) + /// - `data`: the actual inline data fn make_raw_inline(length: u32, data: &[u8]) -> u128 { assert!(length as usize <= 12, "Inline length must be ≤ 12"); assert!(data.len() == length as usize, "Data must match length"); @@ -1153,11 +1195,12 @@ mod tests { u128::from_le_bytes(raw_bytes) } - // Test multiple inline values of increasing length and content + // Test inputs: include the specific "bar" vs "bar\0" case, plus length and lexical variations let test_inputs: Vec<&[u8]> = vec![ b"a", - b"ab", - b"abc", + b"aa", + b"aaa", + b"aab", b"abcd", b"abcde", b"abcdef", @@ -1167,24 +1210,55 @@ mod tests { b"abcdefghij", b"abcdefghijk", b"abcdefghijkl", // 12 bytes, max inline + b"bar", + b"bar\0", // special case to test length tiebreaker + b"xyy", + b"xyz", ]; - let mut previous_key = None; - - for input in test_inputs { + // 1) Monotonic key order: content then length + let mut previous_key: Option = None; + for input in &test_inputs { let raw = make_raw_inline(input.len() as u32, input); let key = GenericByteViewArray::::inline_key_fast(raw); - - // Validate that keys are monotonically increasing in lexicographic+length order if let Some(prev) = previous_key { assert!( prev < key, - "Key for {:?} was not greater than previous key", - input + "Key for {:?} (0x{:032x}) was not less than next key (0x{:032x})", + input, + prev, + key ); } - previous_key = Some(key); } + + // 2) Cross-check against GenericBinaryArray comparison + let array: GenericBinaryArray = + GenericBinaryArray::from(test_inputs.iter().map(|s| Some(*s)).collect::>()); + + for i in 0..array.len() - 1 { + let v1 = array.value(i); + let v2 = array.value(i + 1); + 
// Ensure lexical ordering matches + assert!(v1 < v2, "Array compare failed: {:?} !< {:?}", v1, v2); + // Ensure fast key compare matches + let key1 = GenericByteViewArray::::inline_key_fast(make_raw_inline( + v1.len() as u32, + v1, + )); + let key2 = GenericByteViewArray::::inline_key_fast(make_raw_inline( + v2.len() as u32, + v2, + )); + assert!( + key1 < key2, + "Key compare failed: key({:?})=0x{:032x} !< key({:?})=0x{:032x}", + v1, + key1, + v2, + key2 + ); + } } } From 5f80dc747197b0c41b0454e6193811637bebcc04 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Fri, 27 Jun 2025 23:13:00 +0800 Subject: [PATCH 16/18] fix doc --- arrow-array/src/array/byte_view_array.rs | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 398e39356bfb..e92d3100b003 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -605,22 +605,6 @@ impl GenericByteViewArray { /// key("bar\0") = 0x0000000000000000000062617200000004 /// ⇒ key("bar") < key("bar\0") /// ``` - /// - /// Background (naive implementation, two-pass): - /// - /// fn compare_strings(a: &RawInline, b: &RawInline) -> Ordering { - /// // 1) Compare up to 12 bytes of content - /// let ord = compare_u96(a.inline_bytes_be(), b.inline_bytes_be()); - /// if ord != Ordering::Equal { - /// return ord; - /// } - /// // 2) Tiebreaker: compare lengths - /// a.len().cmp(&b.len()) - /// } - /// - /// Folding content and length into one `u128` lets you do: - /// - /// key(a).cmp(&key(b)) #[inline(always)] pub fn inline_key_fast(raw: u128) -> u128 { // Convert the raw u128 (little-endian) into bytes for manipulation From 918a78965846642e4469e276385c7e5eb8e62151 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Fri, 27 Jun 2025 23:22:00 +0800 Subject: [PATCH 17/18] fix clippy --- arrow-array/src/array/byte_view_array.rs | 13 +++---------- 1 file 
changed, 3 insertions(+), 10 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index e92d3100b003..f438c335a412 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -1208,10 +1208,7 @@ mod tests { if let Some(prev) = previous_key { assert!( prev < key, - "Key for {:?} (0x{:032x}) was not less than next key (0x{:032x})", - input, - prev, - key + "Key for {input:?} (0x{prev:032x}) was not less than next key (0x{key:032x})", ); } previous_key = Some(key); @@ -1225,7 +1222,7 @@ mod tests { let v1 = array.value(i); let v2 = array.value(i + 1); // Ensure lexical ordering matches - assert!(v1 < v2, "Array compare failed: {:?} !< {:?}", v1, v2); + assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}"); // Ensure fast key compare matches let key1 = GenericByteViewArray::::inline_key_fast(make_raw_inline( v1.len() as u32, @@ -1237,11 +1234,7 @@ mod tests { )); assert!( key1 < key2, - "Key compare failed: key({:?})=0x{:032x} !< key({:?})=0x{:032x}", - v1, - key1, - v2, - key2 + "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}", ); } } From 153cf8596f79b04c67e05c1ca94796b39b174de8 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Sat, 28 Jun 2025 08:19:47 +0800 Subject: [PATCH 18/18] address comments --- arrow-array/src/array/byte_view_array.rs | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index f438c335a412..46fc8d9bd584 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -1200,21 +1200,7 @@ mod tests { b"xyz", ]; - // 1) Monotonic key order: content then length - let mut previous_key: Option = None; - for input in &test_inputs { - let raw = make_raw_inline(input.len() as u32, input); - let key = GenericByteViewArray::::inline_key_fast(raw); - if 
let Some(prev) = previous_key { - assert!( - prev < key, - "Key for {input:?} (0x{prev:032x}) was not less than next key (0x{key:032x})", - ); - } - previous_key = Some(key); - } - - // 2) Cross-check against GenericBinaryArray comparison + // Monotonic key order: content then length, and cross-check against GenericBinaryArray comparison let array: GenericBinaryArray = GenericBinaryArray::from(test_inputs.iter().map(|s| Some(*s)).collect::>());