Commit: Merge 63beecd into 719096b

jhorstmann authored Jan 10, 2022
2 parents 719096b + 63beecd, commit bb99dab
Showing 3 changed files with 110 additions and 55 deletions.
14 changes: 13 additions & 1 deletion arrow/benches/comparison_kernels.rs
@@ -22,7 +22,7 @@ use criterion::Criterion;
 extern crate arrow;
 
 use arrow::compute::*;
-use arrow::datatypes::ArrowNumericType;
+use arrow::datatypes::{ArrowNumericType, IntervalMonthDayNanoType};
 use arrow::util::bench_util::*;
 use arrow::{array::*, datatypes::Float32Type};
 
@@ -138,6 +138,11 @@ fn add_benchmark(c: &mut Criterion) {
     let arr_a = create_primitive_array_with_seed::<Float32Type>(size, 0.0, 42);
     let arr_b = create_primitive_array_with_seed::<Float32Type>(size, 0.0, 43);
 
+    let arr_month_day_nano_a =
+        create_primitive_array_with_seed::<IntervalMonthDayNanoType>(size, 0.0, 43);
+    let arr_month_day_nano_b =
+        create_primitive_array_with_seed::<IntervalMonthDayNanoType>(size, 0.0, 43);
+
     let arr_string = create_string_array::<i32>(size, 0.0);
 
     c.bench_function("eq Float32", |b| b.iter(|| bench_eq(&arr_a, &arr_b)));
@@ -170,6 +175,13 @@ fn add_benchmark(c: &mut Criterion) {
         b.iter(|| bench_gt_eq_scalar(&arr_a, 1.0))
     });
 
+    c.bench_function("eq MonthDayNano", |b| {
+        b.iter(|| bench_eq(&arr_month_day_nano_a, &arr_month_day_nano_b))
+    });
+    c.bench_function("eq scalar MonthDayNano", |b| {
+        b.iter(|| bench_eq_scalar(&arr_month_day_nano_a, 123))
+    });
+
     c.bench_function("like_utf8 scalar equals", |b| {
         b.iter(|| bench_like_utf8_scalar(&arr_string, "xxxx"))
     });
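For context, a minimal usage sketch of the kernels these new benchmarks measure, applied to the i128-backed IntervalMonthDayNano type. This is not code from the commit; the function name and values are illustrative only.

use arrow::array::IntervalMonthDayNanoArray;
use arrow::compute::{eq, eq_scalar};
use arrow::error::Result;

fn interval_eq_example() -> Result<()> {
    // IntervalMonthDayNano is backed by i128, the widest SIMD lane type
    let a = IntervalMonthDayNanoArray::from(vec![1, 2, 123]);
    let b = IntervalMonthDayNanoArray::from(vec![1, 5, 123]);

    let mask = eq(&a, &b)?; // BooleanArray: [true, false, true]
    assert!(mask.value(0) && !mask.value(1) && mask.value(2));

    let scalar_mask = eq_scalar(&a, 123)?; // BooleanArray: [false, false, true]
    assert!(scalar_mask.value(2));
    Ok(())
}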
122 changes: 70 additions & 52 deletions arrow/src/compute/kernels/comparison.rs
@@ -1471,53 +1471,65 @@ where

     let null_bit_buffer = combine_option_bitmap(left.data_ref(), right.data_ref(), len)?;
 
+    // we process the data in chunks so that each iteration results in one u64 of comparison result bits
+    const CHUNK_SIZE: usize = 64;
     let lanes = T::lanes();
+
+    // this is currently the case for all our datatypes and allows us to always append full bytes
+    assert!(
+        lanes <= CHUNK_SIZE,
+        "Number of vector lanes must be at most 64"
+    );
+
     let buffer_size = bit_util::ceil(len, 8);
     let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false);
 
-    // this is currently the case for all our datatypes and allows us to always append full bytes
-    assert_eq!(lanes % 8, 0, "Number of vector lanes must be multiple of 8");
-    let mut left_chunks = left.values().chunks_exact(lanes);
-    let mut right_chunks = right.values().chunks_exact(lanes);
+    let mut left_chunks = left.values().chunks_exact(CHUNK_SIZE);
+    let mut right_chunks = right.values().chunks_exact(CHUNK_SIZE);
 
     // safety: result is newly created above, always written as a T below
     let result_chunks = unsafe { result.typed_data_mut() };
     let result_remainder = left_chunks
         .borrow_mut()
         .zip(right_chunks.borrow_mut())
         .fold(result_chunks, |result_slice, (left_slice, right_slice)| {
-            let simd_left = T::load(left_slice);
-            let simd_right = T::load(right_slice);
-            let simd_result = simd_op(simd_left, simd_right);
+            let mut i = 0;
+            let mut bitmask = 0_u64;
+            while i < CHUNK_SIZE {
+                let simd_left = T::load(&left_slice[i..]);
+                let simd_right = T::load(&right_slice[i..]);
+                let simd_result = simd_op(simd_left, simd_right);
 
-            let bitmask = T::mask_to_u64(&simd_result);
+                let m = T::mask_to_u64(&simd_result);
+                bitmask |= m << i;
 
+                i += lanes;
+            }
             let bytes = bitmask.to_le_bytes();
-            result_slice[0..lanes / 8].copy_from_slice(&bytes[0..lanes / 8]);
+            result_slice[0..8].copy_from_slice(&bytes);
 
-            &mut result_slice[lanes / 8..]
+            &mut result_slice[8..]
         });
 
     let left_remainder = left_chunks.remainder();
     let right_remainder = right_chunks.remainder();
 
-    let remainder_bitmask = left_remainder
-        .iter()
-        .zip(right_remainder.iter())
-        .enumerate()
-        .fold(0_u64, |mut mask, (i, (scalar_left, scalar_right))| {
-            let bit = if scalar_op(*scalar_left, *scalar_right) {
-                1_u64
-            } else {
-                0_u64
-            };
-            mask |= bit << i;
-            mask
-        });
-    let remainder_mask_as_bytes =
-        &remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)];
-    result_remainder.copy_from_slice(remainder_mask_as_bytes);
+    assert_eq!(left_remainder.len(), right_remainder.len());
+
+    if !left_remainder.is_empty() {
+        let remainder_bitmask = left_remainder
+            .iter()
+            .zip(right_remainder.iter())
+            .enumerate()
+            .fold(0_u64, |mut mask, (i, (scalar_left, scalar_right))| {
+                let bit = scalar_op(*scalar_left, *scalar_right) as u64;
+                mask |= bit << i;
+                mask
+            });
+        let remainder_mask_as_bytes =
+            &remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)];
+        result_remainder.copy_from_slice(remainder_mask_as_bytes);
+    }
 
     let data = unsafe {
         ArrayData::new_unchecked(
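The core of the fix is the packing loop above: the old code asserted `lanes % 8 == 0`, which the 4-lane i128 vectors violate, and wrote only `lanes / 8` bytes per SIMD comparison (zero bytes when `lanes < 8`). The new code processes fixed 64-element chunks and always writes one whole u64 of result bits. A dependency-free scalar model of the scheme, as an editorial sketch with hypothetical 4-lane comparisons standing in for `T::load` / `simd_op` / `T::mask_to_u64`:

// Scalar model of the chunked bit packing (editorial sketch, not commit code).
fn pack_chunk(left: &[i128; 64], right: &[i128; 64]) -> u64 {
    const CHUNK_SIZE: usize = 64;
    const LANES: usize = 4; // stand-in for T::lanes()

    let mut bitmask = 0_u64;
    let mut i = 0;
    while i < CHUNK_SIZE {
        // stand-in for the SIMD compare: LANES result bits, LSB = lane 0
        let mut m = 0_u64;
        for lane in 0..LANES {
            m |= ((left[i + lane] == right[i + lane]) as u64) << lane;
        }
        // shifting by the element index i places this iteration's lane bits
        // at their position inside the 64-bit chunk
        bitmask |= m << i;
        i += LANES;
    }
    bitmask
}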
@@ -1551,16 +1563,20 @@ where

     let len = left.len();
 
+    // we process the data in chunks so that each iteration results in one u64 of comparison result bits
+    const CHUNK_SIZE: usize = 64;
     let lanes = T::lanes();
-    let buffer_size = bit_util::ceil(len, 8);
-    let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false);
 
     // this is currently the case for all our datatypes and allows us to always append full bytes
     assert!(
-        lanes % 8 == 0,
-        "Number of vector lanes must be multiple of 8"
+        lanes <= CHUNK_SIZE,
+        "Number of vector lanes must be at most 64"
     );
-    let mut left_chunks = left.values().chunks_exact(lanes);
+
+    let buffer_size = bit_util::ceil(len, 8);
+    let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false);
+
+    let mut left_chunks = left.values().chunks_exact(CHUNK_SIZE);
     let simd_right = T::init(right);
 
     // safety: result is newly created above, always written as a T below
@@ -1569,34 +1585,38 @@
     left_chunks
         .borrow_mut()
         .fold(result_chunks, |result_slice, left_slice| {
-            let simd_left = T::load(left_slice);
-            let simd_result = simd_op(simd_left, simd_right);
+            let mut i = 0;
+            let mut bitmask = 0_u64;
+            while i < CHUNK_SIZE {
+                let simd_left = T::load(&left_slice[i..]);
+                let simd_result = simd_op(simd_left, simd_right);
 
-            let bitmask = T::mask_to_u64(&simd_result);
+                let m = T::mask_to_u64(&simd_result);
+                bitmask |= m << i;
 
+                i += lanes;
+            }
             let bytes = bitmask.to_le_bytes();
-            result_slice[0..lanes / 8].copy_from_slice(&bytes[0..lanes / 8]);
+            result_slice[0..8].copy_from_slice(&bytes);
 
-            &mut result_slice[lanes / 8..]
+            &mut result_slice[8..]
         });
 
     let left_remainder = left_chunks.remainder();
 
-    let remainder_bitmask =
-        left_remainder
-            .iter()
-            .enumerate()
-            .fold(0_u64, |mut mask, (i, scalar_left)| {
-                let bit = if scalar_op(*scalar_left, right) {
-                    1_u64
-                } else {
-                    0_u64
-                };
-                mask |= bit << i;
-                mask
-            });
-    let remainder_mask_as_bytes =
-        &remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)];
-    result_remainder.copy_from_slice(remainder_mask_as_bytes);
+    if !left_remainder.is_empty() {
+        let remainder_bitmask = left_remainder.iter().enumerate().fold(
+            0_u64,
+            |mut mask, (i, scalar_left)| {
+                let bit = scalar_op(*scalar_left, right) as u64;
+                mask |= bit << i;
+                mask
+            },
+        );
+        let remainder_mask_as_bytes =
+            &remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)];
+        result_remainder.copy_from_slice(remainder_mask_as_bytes);
+    }
 
     let null_bit_buffer = left
         .data_ref()
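The scalar remainder path packs the up-to-63 trailing comparison results into one u64 the same way; the new `is_empty` guard skips the write entirely when the input length is a multiple of 64. A standalone editorial sketch of that byte extraction, with `bit_util::ceil` inlined:

// Packs trailing comparison results and returns only the occupied bytes.
// Little-endian byte order matches Arrow's LSB-first bitmap layout.
fn remainder_bytes(left: &[i128], right: i128) -> Vec<u8> {
    assert!(left.len() < 64);
    let bitmask = left.iter().enumerate().fold(0_u64, |mut mask, (i, &v)| {
        mask |= ((v == right) as u64) << i;
        mask
    });
    let n_bytes = (left.len() + 7) / 8; // bit_util::ceil(left.len(), 8)
    bitmask.to_le_bytes()[..n_bytes].to_vec()
}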
@@ -2723,8 +2743,6 @@ mod tests {
         );
     }
 
-    // Fails when simd is enabled: https://github.com/apache/arrow-rs/issues/1136
-    #[cfg(not(feature = "simd"))]
     #[test]
     fn test_interval_array() {
         let a = IntervalDayTimeArray::from(
29 changes: 27 additions & 2 deletions arrow/src/datatypes/numeric.rs
@@ -147,6 +147,20 @@ macro_rules! make_numeric_type {
             // this match will get removed by the compiler since the number of lanes is known at
             // compile-time for each concrete numeric type
             match Self::lanes() {
+                4 => {
+                    // the bit position in each lane indicates the index of that lane
+                    let vecidx = i128x4::new(1, 2, 4, 8);
+
+                    // broadcast the lowermost 4 bits of mask to each lane
+                    let vecmask = i128x4::splat((mask & 0x0F) as i128);
+                    // compute whether the bit corresponding to each lane's index is set
+                    let vecmask = (vecidx & vecmask).eq(vecidx);
+
+                    // transmute is necessary because the different match arms return different
+                    // mask types; at runtime only one of these expressions will exist per type,
+                    // with the type being equal to `SimdMask`.
+                    unsafe { std::mem::transmute(vecmask) }
+                }
                 8 => {
                     // the bit position in each lane indicates the index of that lane
                     let vecidx = i64x8::new(1, 2, 4, 8, 16, 32, 64, 128);
@@ -448,11 +462,11 @@ macro_rules! make_float_numeric_type {
 make_float_numeric_type!(Float32Type, f32x16);
 make_float_numeric_type!(Float64Type, f64x8);
 
-#[cfg(all(test, simd_x86))]
+#[cfg(all(test, feature = "simd"))]
 mod tests {
     use crate::datatypes::{
         ArrowNumericType, Float32Type, Float64Type, Int32Type, Int64Type, Int8Type,
-        UInt16Type,
+        IntervalMonthDayNanoType, UInt16Type,
     };
     use packed_simd::*;
     use FromCast;

@@ -470,6 +484,17 @@
         }};
     }
 
+    #[test]
+    fn test_mask_i128() {
+        let mask = 0b1101;
+        let actual = IntervalMonthDayNanoType::mask_from_u64(mask);
+        let expected = expected_mask!(i128, mask);
+        let expected =
+            m128x4::from_cast(i128x4::from_slice_unaligned(expected.as_slice()));
+
+        assert_eq!(expected, actual);
+    }
+
     #[test]
     fn test_mask_f64() {
         let mask = 0b10101010;
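The new `4 =>` arm in `mask_from_u64` follows the same broadcast-and-compare pattern as the existing 8- and 16-lane arms, and `test_mask_i128` above pins down its behavior. A scalar model of the trick, as an editorial sketch with plain arrays in place of `packed_simd` vectors:

// Lane i of the result turns on exactly when bit i of the mask is set:
// AND-ing the broadcast mask with each lane's distinct power of two
// isolates that lane's bit, and comparing against the power of two
// turns it into a boolean.
fn mask_from_u64_model(mask: u64) -> [bool; 4] {
    let vecidx: [i128; 4] = [1, 2, 4, 8]; // one distinct bit per lane
    let splat = (mask & 0x0F) as i128; // broadcast the low 4 bits
    let mut lanes = [false; 4];
    for (lane, idx) in vecidx.iter().enumerate() {
        lanes[lane] = (*idx & splat) == *idx;
    }
    lanes
}

// mask_from_u64_model(0b1101) == [true, false, true, true]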
