39 changes: 39 additions & 0 deletions arrow-buffer/src/buffer/ops.rs
@@ -71,6 +71,30 @@ pub fn bitwise_bin_op_helper<F>(
where
    F: FnMut(u64, u64) -> u64,
{
    // If the underlying buffers are aligned to u64, we can apply the operation
    // directly on the u64 slices to improve performance.
    if left_offset_in_bits == 0 && right_offset_in_bits == 0 {
        unsafe {
            let (left_prefix, left_u64s, left_suffix) = left.as_slice().align_to::<u64>();
            let (right_prefix, right_u64s, right_suffix) = right.as_slice().align_to::<u64>();
            // If there is no prefix or suffix, both buffers are aligned and we can
            // apply the operation directly on u64s.
            // TODO: also handle non-empty suffixes by processing them separately
            if left_prefix.is_empty()
                && right_prefix.is_empty()
                && left_suffix.is_empty()
                && right_suffix.is_empty()
Comment on lines +83 to +86

Member:
Even if there is a prefix/suffix, could you do u64 operations on the aligned portion and fall back to bitwise operations on the unaligned portion?

That being said, this seems like a fine optimization on its own.

Contributor Author:
That is a good point (as long as the prefixes and suffixes are the same length).
            {
                let result_u64s = left_u64s
                    .iter()
                    .zip(right_u64s.iter())
                    .map(|(l, r)| op(*l, *r))
                    .collect::<Vec<u64>>();
                return result_u64s.into();
            }
        }
    }

Contributor Author:
I am pretty excited to see how much this actually helps with performance. This code should vectorize pretty spectacularly.

Contributor Author:
TLDR: 30-50% faster 😎

    let left_chunks = left.bit_chunks(left_offset_in_bits, len_in_bits);
    let right_chunks = right.bit_chunks(right_offset_in_bits, len_in_bits);

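As a side note on the review thread above: a minimal standalone sketch (not part of this PR; `bin_op_mixed` is a hypothetical name) of running `op` over the u64-aligned middle while falling back to byte-at-a-time processing for the unaligned parts. It assumes both slices split at the same offsets (equal-length prefixes), and that `op` is a pure bitwise function such as `&`, `|`, or `^`, so applying it to zero-extended bytes is equivalent:

    /// Sketch of the reviewer's suggestion: u64 ops on the aligned middle,
    /// byte-wise fallback on the unaligned prefix/suffix. Illustrative only.
    fn bin_op_mixed<F>(left: &[u8], right: &[u8], mut op: F) -> Vec<u8>
    where
        F: FnMut(u64, u64) -> u64,
    {
        assert_eq!(left.len(), right.len());
        let mut out = Vec::with_capacity(left.len());
        unsafe {
            let (lp, lm, ls) = left.align_to::<u64>();
            let (rp, rm, rs) = right.align_to::<u64>();
            if lp.len() == rp.len() {
                // unaligned prefix: widen each byte to u64, apply op, narrow back
                out.extend(lp.iter().zip(rp).map(|(l, r)| op(*l as u64, *r as u64) as u8));
                // aligned middle: one whole u64 word per iteration (the fast path)
                for (l, r) in lm.iter().zip(rm) {
                    out.extend_from_slice(&op(*l, *r).to_ne_bytes());
                }
                // unaligned suffix, handled the same way as the prefix
                out.extend(ls.iter().zip(rs).map(|(l, r)| op(*l as u64, *r as u64) as u8));
                return out;
            }
        }
        // prefixes differ in length: the middles no longer line up bit-for-bit,
        // so fall back to plain byte-wise processing
        left.iter()
            .zip(right)
            .map(|(l, r)| op(*l as u64, *r as u64) as u8)
            .collect()
    }

The fallback branch reflects the author's caveat: when the two prefixes differ in length, the aligned middles of the two buffers do not correspond to the same bit positions.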
@@ -102,6 +126,21 @@ pub fn bitwise_unary_op_helper<F>(
where
    F: FnMut(u64) -> u64,
{
    // If the underlying buffer is aligned to u64, apply the operation directly
    // on the u64 slices to improve performance.
    if offset_in_bits == 0 && len_in_bits > 0 {
        unsafe {
            let (prefix, u64s, suffix) = left.as_slice().align_to::<u64>();
            // If there is no prefix or suffix, the buffer is aligned and we can
            // apply the operation directly on u64s.
            // TODO: also handle non-empty suffixes by processing them separately
            if prefix.is_empty() && suffix.is_empty() {
                let result_u64s = u64s.iter().map(|l| op(*l)).collect::<Vec<u64>>();
                return result_u64s.into();
            }
        }
    }

    // reserve capacity and set length so we can get a typed view of u64 chunks
    let mut result =
        MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false);
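A hypothetical sketch of the TODO above for the unary case (`unary_op_mixed` is an illustrative name, not crate API): the aligned middle still takes the u64 path while any unaligned prefix/suffix bytes are handled individually, again assuming `op` is purely bitwise so that widening, applying, and truncating per byte is equivalent:

    /// Sketch only: u64 fast path on the aligned middle, byte-at-a-time
    /// handling for the unaligned prefix and suffix.
    fn unary_op_mixed<F>(bytes: &[u8], mut op: F) -> Vec<u8>
    where
        F: FnMut(u64) -> u64,
    {
        let mut out = Vec::with_capacity(bytes.len());
        unsafe {
            let (prefix, middle, suffix) = bytes.align_to::<u64>();
            // unaligned leading bytes, widened to u64 and truncated back
            out.extend(prefix.iter().map(|b| op(*b as u64) as u8));
            // aligned middle: a whole u64 per iteration, vectorizes well
            for word in middle {
                out.extend_from_slice(&op(*word).to_ne_bytes());
            }
            // unaligned trailing bytes
            out.extend(suffix.iter().map(|b| op(*b as u64) as u8));
        }
        out
    }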
3 changes: 2 additions & 1 deletion arrow-buffer/src/util/bit_chunk_iterator.rs
@@ -223,7 +223,8 @@ impl<'a> BitChunks<'a> {
    pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
        assert!(
            ceil(offset + len, 8) <= buffer.len(),
            "offset + len out of bounds. Buffer length in bits: {}, requested offset: {offset}, len: {len}",
            buffer.len() * 8,
        );

        let byte_offset = offset / 8;
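For illustration, a test that exercises the improved message (the import path is inferred from the file path above and may differ from the crate's actual re-exports): a 1-byte buffer holds 8 bits, so requesting 4 bits starting at offset 6 trips the assertion:

    use arrow_buffer::util::bit_chunk_iterator::BitChunks;

    #[test]
    #[should_panic(expected = "offset + len out of bounds")]
    fn bit_chunks_out_of_bounds() {
        // ceil(6 + 4, 8) = 2 bytes needed, but the buffer is only 1 byte,
        // so `new` panics with:
        // "offset + len out of bounds. Buffer length in bits: 8, requested offset: 6, len: 4"
        let buffer: [u8; 1] = [0b1010_1010];
        let _ = BitChunks::new(&buffer, 6, 4);
    }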