diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs
index ec7983c303f0..55a3a4c0d678 100644
--- a/arrow-array/src/array/list_array.rs
+++ b/arrow-array/src/array/list_array.rs
@@ -466,7 +466,7 @@ impl<OffsetSize: OffsetSizeTrait> From<FixedSizeListArray> for GenericListArray<
             _ => unreachable!(),
         };
 
-        let offsets = OffsetBuffer::from_lengths(std::iter::repeat_n(size, value.len()));
+        let offsets = OffsetBuffer::from_repeated_length(size, value.len());
 
         Self {
             data_type: Self::DATA_TYPE_CONSTRUCTOR(field.clone()),
diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs
index fe3a57a38248..66fa7dd22ec5 100644
--- a/arrow-buffer/src/buffer/offset.rs
+++ b/arrow-buffer/src/buffer/offset.rs
@@ -112,6 +112,9 @@ impl<O: ArrowNativeType> OffsetBuffer<O> {
     /// assert_eq!(offsets.as_ref(), &[0, 1, 4, 9]);
     /// ```
     ///
+    /// If you want to create an [`OffsetBuffer`] where all lengths are the same,
+    /// consider using the faster [`OffsetBuffer::from_repeated_length`] instead.
+    ///
     /// # Panics
     ///
     /// Panics on overflow
@@ -133,6 +136,43 @@ impl<O: ArrowNativeType> OffsetBuffer<O> {
         Self(out.into())
     }
 
+    /// Create a new [`OffsetBuffer`] where each slice has the same length
+    /// `length`, repeated `n` times.
+    ///
+    /// Offset `i` of the result is `i * length`, so the last offset is
+    /// `length * n`.
+    ///
+    /// # Example
+    /// ```
+    /// # use arrow_buffer::OffsetBuffer;
+    /// let offsets = OffsetBuffer::<i32>::from_repeated_length(4, 3);
+    /// assert_eq!(offsets.as_ref(), &[0, 4, 8, 12]);
+    /// ```
+    ///
+    /// # Panics
+    ///
+    /// Panics if `length * n` overflows `usize` or does not fit in `O`.
+    pub fn from_repeated_length(length: usize, n: usize) -> Self {
+        if n == 0 {
+            return Self::new_empty();
+        }
+
+        if length == 0 {
+            return Self::new_zeroed(n);
+        }
+
+        // The last offset, `length * n`, is the largest one: if it fits in both
+        // `usize` and `O`, every intermediate `index * length` below does too.
+        let total = length.checked_mul(n).expect("usize overflow");
+        O::from_usize(total).expect("offset overflow");
+
+        let offsets = (0..=n)
+            .map(|index| O::usize_as(index * length))
+            .collect::<Vec<O>>();
+
+        Self(ScalarBuffer::from(offsets))
+    }
+
     /// Get an Iterator over the lengths of this [`OffsetBuffer`]
     ///
     /// ```
@@ -283,6 +323,36 @@ mod tests {
         OffsetBuffer::<i32>::from_lengths([usize::MAX, 1]);
     }
 
+    #[test]
+    #[should_panic(expected = "offset overflow")]
+    fn from_repeated_lengths_offset_length_overflow() {
+        OffsetBuffer::<i32>::from_repeated_length(i32::MAX as usize / 4, 5);
+    }
+
+    #[test]
+    #[should_panic(expected = "offset overflow")]
+    fn from_repeated_lengths_offset_repeat_overflow() {
+        OffsetBuffer::<i32>::from_repeated_length(1, i32::MAX as usize + 1);
+    }
+
+    #[test]
+    #[should_panic(expected = "offset overflow")]
+    fn from_repeated_lengths_usize_length_overflow() {
+        OffsetBuffer::<i32>::from_repeated_length(usize::MAX, 1);
+    }
+
+    #[test]
+    #[should_panic(expected = "usize overflow")]
+    fn from_repeated_lengths_usize_length_usize_overflow() {
+        OffsetBuffer::<i32>::from_repeated_length(usize::MAX, 2);
+    }
+
+    #[test]
+    #[should_panic(expected = "offset overflow")]
+    fn from_repeated_lengths_usize_repeat_overflow() {
+        OffsetBuffer::<i32>::from_repeated_length(1, usize::MAX);
+    }
+
     #[test]
     fn get_lengths() {
         let offsets = OffsetBuffer::<i32>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
@@ -323,4 +393,76 @@ mod tests {
         let default = OffsetBuffer::<i32>::default();
         assert_eq!(default.as_ref(), &[0]);
     }
+
+    #[test]
+    fn from_repeated_length_basic() {
+        // Basic case with length 4, repeated 3 times
+        let buffer = OffsetBuffer::<i32>::from_repeated_length(4, 3);
+        assert_eq!(buffer.as_ref(), &[0, 4, 8, 12]);
+
+        // Verify the lengths are correct
+        let lengths: Vec<usize> = buffer.lengths().collect();
+        assert_eq!(lengths, vec![4, 4, 4]);
+    }
+
+    #[test]
+    fn from_repeated_length_single_repeat() {
+        // Length 5, repeated once
+        let buffer = OffsetBuffer::<i32>::from_repeated_length(5, 1);
+        assert_eq!(buffer.as_ref(), &[0, 5]);
+
+        let lengths: Vec<usize> = buffer.lengths().collect();
+        assert_eq!(lengths, vec![5]);
+    }
+
+    #[test]
+    fn from_repeated_length_zero_repeats() {
+        let buffer = OffsetBuffer::<i32>::from_repeated_length(10, 0);
+        assert_eq!(buffer, OffsetBuffer::<i32>::new_empty());
+    }
+
+    #[test]
+    fn from_repeated_length_zero_length() {
+        // Zero length, repeated 5 times (all zeros)
+        let buffer = OffsetBuffer::<i32>::from_repeated_length(0, 5);
+        assert_eq!(buffer.as_ref(), &[0, 0, 0, 0, 0, 0]);
+
+        // All lengths should be 0
+        let lengths: Vec<usize> = buffer.lengths().collect();
+        assert_eq!(lengths, vec![0, 0, 0, 0, 0]);
+    }
+
+    #[test]
+    fn from_repeated_length_large_values() {
+        // Test with larger values that don't overflow
+        let buffer = OffsetBuffer::<i32>::from_repeated_length(1000, 100);
+        assert_eq!(buffer[0], 0);
+
+        // Verify all lengths are 1000
+        let lengths: Vec<usize> = buffer.lengths().collect();
+        assert_eq!(lengths.len(), 100);
+        assert!(lengths.iter().all(|&len| len == 1000));
+    }
+
+    #[test]
+    fn from_repeated_length_unit_length() {
+        // Length 1, repeated multiple times
+        let buffer = OffsetBuffer::<i32>::from_repeated_length(1, 10);
+        assert_eq!(buffer.as_ref(), &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
+
+        let lengths: Vec<usize> = buffer.lengths().collect();
+        assert_eq!(lengths, vec![1; 10]);
+    }
+
+    #[test]
+    fn from_repeated_length_max_safe_values() {
+        // Test with maximum safe values for i32
+        // i32::MAX / 3 ensures we don't overflow when repeated twice
+        let third_max = (i32::MAX / 3) as usize;
+        let buffer = OffsetBuffer::<i32>::from_repeated_length(third_max, 2);
+        assert_eq!(
+            buffer.as_ref(),
+            &[0, third_max as i32, (third_max * 2) as i32]
+        );
+    }
 }