Skip to content

Commit 4fd7761

Browse files
committed
chore: Added comments
1 parent 6b76708 commit 4fd7761

File tree

1 file changed

+15
-1
lines changed

1 file changed

+15
-1
lines changed

arrow-cast/src/cast/run_array.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ pub(crate) fn run_end_encoded_cast<K: RunEndIndexType>(
3838
to_type: &DataType,
3939
cast_options: &CastOptions,
4040
) -> Result<ArrayRef, ArrowError> {
41+
// Fast-path dispatch: most physical types can reuse `runs_for_primitive`, the remainder fall
42+
// through to specialized implementations below.
43+
// Route to the most specialized helper for the physical layout of `array`.
4144
match array.data_type() {
4245
DataType::RunEndEncoded(_, _) => {
4346
let run_array = array
@@ -269,6 +272,7 @@ fn runs_for_boolean(array: &BooleanArray) -> (Vec<usize>, Vec<usize>) {
269272
let mut current_value = if current_valid { array.value(0) } else { false };
270273

271274
for idx in 1..len {
275+
// Treat a change in validity the same as a change in value so null boundaries are recorded.
272276
let valid = array.is_valid(idx);
273277
let mut boundary = false;
274278
if current_valid && valid {
@@ -309,6 +313,7 @@ fn runs_for_primitive<T: ArrowPrimitiveType>(
309313
let mut current = unsafe { *values.get_unchecked(0) };
310314
let mut idx = 1;
311315
while idx < len {
316+
// Attempt to advance in 16-byte chunks before falling back to scalar comparison.
312317
let boundary = scan_run_end::<T>(values, current, idx);
313318
if boundary == len {
314319
break;
@@ -394,6 +399,7 @@ fn runs_for_binary_like<T: Copy>(
394399
for idx in 1..len {
395400
let start = to_usize(offsets[idx]);
396401
let end = to_usize(offsets[idx + 1]);
402+
// Any difference in byte length or payload means a new run.
397403
if (end - start) != (current_end - current_start)
398404
|| values[start..end] != values[current_start..current_end]
399405
{
@@ -413,6 +419,7 @@ fn runs_for_binary_like<T: Copy>(
413419
let start = to_usize(offsets[idx]);
414420
let end = to_usize(offsets[idx + 1]);
415421
let (current_start, current_end) = current_range;
422+
// Keep reusing the current byte-range as long as both validity and payload match.
416423
if (end - start) != (current_end - current_start)
417424
|| values[start..end] != values[current_start..current_end]
418425
{
@@ -482,6 +489,7 @@ fn runs_for_fixed_size_binary(array: &FixedSizeBinaryArray) -> (Vec<usize>, Vec<
482489
for idx in 1..len {
483490
let start = idx * width;
484491
let slice = &values[start..start + width];
492+
// Width is constant, so a simple byte slice comparison suffices.
485493
if slice != current_slice {
486494
ensure_capacity(&mut run_boundaries, len);
487495
run_boundaries.push(idx);
@@ -538,6 +546,7 @@ fn runs_generic(array: &dyn Array) -> (Vec<usize>, Vec<usize>) {
538546
let mut current_data = array.slice(0, 1).to_data();
539547
for idx in 1..len {
540548
let next_data = array.slice(idx, 1).to_data();
549+
// Fallback for exotic types: compare `ArrayData` views directly.
541550
if current_data != next_data {
542551
ensure_capacity(&mut run_boundaries, len);
543552
run_boundaries.push(idx);
@@ -566,6 +575,7 @@ fn ensure_capacity(vec: &mut Vec<usize>, total_len: usize) {
566575

567576
fn finalize_runs(mut run_boundaries: Vec<usize>, len: usize) -> (Vec<usize>, Vec<usize>) {
568577
let mut values_indexes = Vec::with_capacity(run_boundaries.len() + 1);
578+
// Values array always pulls the first element of each run; index 0 is by definition a run start.
569579
values_indexes.push(0);
570580
values_indexes.extend_from_slice(&run_boundaries);
571581
run_boundaries.push(len);
@@ -579,6 +589,7 @@ fn scan_run_end<T: ArrowPrimitiveType>(
579589
start: usize,
580590
) -> usize {
581591
let element_size = std::mem::size_of::<T::Native>();
592+
// Only attempt the chunked search when the element is at most 8 bytes wide and its size divides evenly into 16 bytes.
582593
if element_size <= 8 && 16 % element_size == 0 {
583594
let elements_per_chunk = 16 / element_size;
584595
return scan_run_end_chunk::<T>(values, current, start, elements_per_chunk, element_size);
@@ -601,6 +612,9 @@ fn scan_run_end_chunk<T: ArrowPrimitiveType>(
601612
}
602613

603614
let mut pattern_bytes = [0u8; 16];
615+
// SAFETY: `T::Native` is guaranteed by `ArrowPrimitiveType` to have a plain-old-data layout,
616+
// allowing the value to be viewed as raw bytes. We copy exactly `element_size` bytes, so the
617+
// slice built from `current` stays within bounds.
604618
unsafe {
605619
let value_bytes =
606620
std::slice::from_raw_parts(&current as *const T::Native as *const u8, element_size);
@@ -611,6 +625,7 @@ fn scan_run_end_chunk<T: ArrowPrimitiveType>(
611625
let pattern = u128::from_ne_bytes(pattern_bytes);
612626

613627
while idx + elements_per_chunk <= len {
628+
// SAFETY: the loop condition `idx + elements_per_chunk <= len` keeps the whole 16-byte chunk inside `values`, and `read_unaligned` has no alignment requirement.
614629
let chunk = unsafe { (values.as_ptr().add(idx) as *const u128).read_unaligned() };
615630
if chunk != pattern {
616631
for offset in 0..elements_per_chunk {
@@ -619,7 +634,6 @@ fn scan_run_end_chunk<T: ArrowPrimitiveType>(
619634
return idx + offset;
620635
}
621636
}
622-
unreachable!("chunk mismatch without locating differing element");
623637
}
624638
idx += elements_per_chunk;
625639
}

0 commit comments

Comments
 (0)