Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce Row format backed by raw bytes #1782

Merged
merged 9 commits into from
Feb 10, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,7 @@ members = [
[profile.release]
lto = true
codegen-units = 1

[patch.crates-io]
arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "731e132489b99cd688f884642cf20de52aed24d0" }
parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "731e132489b99cd688f884642cf20de52aed24d0" }
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Relies on apache/arrow-rs@e375bba; this patch will be removed once arrow 9.0.1 is released.

2 changes: 2 additions & 0 deletions datafusion/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,8 @@ pub use arrow;
pub use parquet;

pub(crate) mod field_util;
#[allow(dead_code)]
yjshen marked this conversation as resolved.
Show resolved Hide resolved
pub(crate) mod row;

#[cfg(feature = "pyarrow")]
mod pyarrow;
Expand Down
132 changes: 132 additions & 0 deletions datafusion/src/row/bitmap/fmt.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::fmt::Write;

use super::is_set;

/// Formats `bytes` taking into account an offset and length of the form
yjshen marked this conversation as resolved.
Show resolved Hide resolved
pub fn fmt(
bytes: &[u8],
offset: usize,
length: usize,
f: &mut std::fmt::Formatter<'_>,
) -> std::fmt::Result {
yjshen marked this conversation as resolved.
Show resolved Hide resolved
assert!(offset < 8);
yjshen marked this conversation as resolved.
Show resolved Hide resolved

f.write_char('[')?;
let mut remaining = length;
if remaining == 0 {
f.write_char(']')?;
return Ok(());
}

let first = bytes[0];
let bytes = &bytes[1..];
let empty_before = 8usize.saturating_sub(remaining + offset);
f.write_str("0b")?;
for _ in 0..empty_before {
f.write_char('_')?;
}
let until = std::cmp::min(8, offset + remaining);
for i in offset..until {
yjshen marked this conversation as resolved.
Show resolved Hide resolved
if is_set(first, offset + until - 1 - i) {
f.write_char('1')?;
} else {
f.write_char('0')?;
}
}
for _ in 0..offset {
f.write_char('_')?;
}
remaining -= until - offset;

if remaining == 0 {
f.write_char(']')?;
return Ok(());
}

let number_of_bytes = remaining / 8;
for byte in &bytes[..number_of_bytes] {
f.write_str(", ")?;
f.write_fmt(format_args!("{:#010b}", byte))?;
}
remaining -= number_of_bytes * 8;
if remaining == 0 {
f.write_char(']')?;
return Ok(());
}

let last = bytes[std::cmp::min((length + offset + 7) / 8, bytes.len() - 1)];
let remaining = (length + offset) % 8;
f.write_str(", ")?;
f.write_str("0b")?;
for _ in 0..(8 - remaining) {
f.write_char('_')?;
}
for i in 0..remaining {
yjshen marked this conversation as resolved.
Show resolved Hide resolved
if is_set(last, remaining - 1 - i) {
f.write_char('1')?;
} else {
f.write_char('0')?;
}
}
f.write_char(']')
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Adapter that routes `Debug` formatting through the free `fmt` function.
    /// Fields are `(bytes, offset, length)`.
    struct A<'a>(&'a [u8], usize, usize);

    impl<'a> std::fmt::Debug for A<'a> {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            let A(bytes, offset, length) = *self;
            fmt(bytes, offset, length, f)
        }
    }

    /// Renders the selected bit range to a `String` via the `Debug` adapter.
    fn render(bytes: &[u8], offset: usize, length: usize) -> String {
        format!("{:?}", A(bytes, offset, length))
    }

    #[test]
    fn test_debug() -> std::fmt::Result {
        // Empty range and exactly one full byte.
        assert_eq!(render(&[1], 0, 0), "[]");
        assert_eq!(render(&[0b11000001], 0, 8), "[0b11000001]");

        // One full byte followed by a single trailing bit.
        assert_eq!(render(&[0b11000001, 1], 0, 9), "[0b11000001, 0b_______1]");

        // A two-bit window sliding across one byte; the index is the offset.
        let two_bit_windows = [
            "[0b______01]",
            "[0b_____00_]",
            "[0b____00__]",
            "[0b___00___]",
            "[0b__00____]",
            "[0b_00_____]",
            "[0b00______]",
        ];
        for (offset, expected) in two_bit_windows.iter().enumerate() {
            assert_eq!(render(&[1], offset, 2), *expected);
        }

        // Ranges that straddle a byte boundary.
        assert_eq!(render(&[0b11000001, 1], 1, 9), "[0b1100000_, 0b______01]");
        // extra bytes are ignored
        assert_eq!(
            render(&[0b11000001, 1, 1, 1], 1, 9),
            "[0b1100000_, 0b______01]"
        );
        assert_eq!(
            render(&[0b11000001, 1, 1], 2, 16),
            "[0b110000__, 0b00000001, 0b______01]"
        );
        Ok(())
    }
}
126 changes: 126 additions & 0 deletions datafusion/src/row/bitmap/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! General utilities for null bit section handling
//!
//! Note: this is a tailored version based on [arrow2 bitmap utils](https://github.com/jorgecarleitao/arrow2/tree/main/src/bitmap/utils)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW this appears to itself be a copy of https://docs.rs/arrow/latest/arrow/util/bit_util/index.html

Copy link
Member Author

@yjshen yjshen Feb 9, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The bitmap is rewritten on top of arrow/util/bit_util, along with a much-simplified version of fmt.


mod fmt;

pub use fmt::fmt;

/// Single-bit masks: `BIT_MASK[i]` has only bit `i` set.
const BIT_MASK: [u8; 8] = [
    0b0000_0001,
    0b0000_0010,
    0b0000_0100,
    0b0000_1000,
    0b0001_0000,
    0b0010_0000,
    0b0100_0000,
    0b1000_0000,
];
/// Complement masks: `UNSET_BIT_MASK[i]` has every bit set except bit `i`.
const UNSET_BIT_MASK: [u8; 8] = [
    0b1111_1110,
    0b1111_1101,
    0b1111_1011,
    0b1111_0111,
    0b1110_1111,
    0b1101_1111,
    0b1011_1111,
    0b0111_1111,
];
/// Low-bit masks: `ALL_VALID_MASK[i]` has bits `0..=i` set.
const ALL_VALID_MASK: [u8; 8] = [
    0b0000_0001,
    0b0000_0011,
    0b0000_0111,
    0b0000_1111,
    0b0001_1111,
    0b0011_1111,
    0b0111_1111,
    0b1111_1111,
];

/// Reports whether bit `i` (0 = least significant) of `byte` is set.
///
/// # Panics
///
/// Panics if `i >= 8`.
#[inline]
pub fn is_set(byte: u8, i: usize) -> bool {
    let mask = BIT_MASK[i];
    // `mask` has exactly one bit set, so the AND is either `mask` or zero.
    (byte & mask) == mask
}

/// Returns `byte` with bit `i` set when `value` is `true`, or cleared when
/// `value` is `false`. All other bits are left untouched.
///
/// # Panics
///
/// Panics if `i >= 8`.
#[inline]
pub fn set(byte: u8, i: usize, value: bool) -> u8 {
    if !value {
        return byte & UNSET_BIT_MASK[i];
    }
    byte | BIT_MASK[i]
}

/// Writes `value` into bit `i` of the bitmap `data`, where bit `i` lives in
/// byte `i / 8` at position `i % 8`.
///
/// # Panics
///
/// Panics if `i / 8` is out of bounds for `data`.
#[inline]
pub fn set_bit(data: &mut [u8], i: usize, value: bool) {
    let (byte_index, bit_index) = (i / 8, i % 8);
    let slot = &mut data[byte_index];
    *slot = set(*slot, bit_index, value);
}

/// Reads bit `i` of the bitmap `data` without bounds-checking the byte access.
///
/// # Safety
///
/// The caller must guarantee `i < data.len() * 8`; any larger `i` reads past
/// the end of `data`, which is undefined behavior.
#[inline]
pub unsafe fn get_bit_unchecked(data: &[u8], i: usize) -> bool {
    let (byte_index, bit_index) = (i / 8, i % 8);
    // SAFETY: the caller guarantees `i < data.len() * 8`, hence
    // `byte_index < data.len()`.
    let byte = *data.get_unchecked(byte_index);
    (byte & BIT_MASK[bit_index]) != 0
}

/// Returns the number of bytes required to hold `bits` bits, i.e. `bits / 8`
/// rounded up.
///
/// The result is computed without adding to `bits`, so it is exact for every
/// `usize` input, including values within 7 of `usize::MAX`.
#[inline]
pub fn bytes_for(bits: usize) -> usize {
    bits / 8 + usize::from(bits % 8 != 0)
}

/// Returns `true` if the first `n` bits of `data` are all set, i.e. all `n`
/// fields are valid.
///
/// Only the low `n % 8` bits of a trailing partial byte are inspected; padding
/// bits beyond position `n` do not affect the result.
///
/// Whole bytes are checked for as many of the first `n / 8` bytes as `data`
/// actually contains.
///
/// # Panics
///
/// Panics if `n` is not a multiple of 8, every inspected whole byte is valid,
/// and `data` has no byte at index `n / 8`.
pub fn all_valid(data: &[u8], n: usize) -> bool {
    let whole_bytes_valid = data
        .iter()
        .take(n / 8)
        .all(|byte| *byte == ALL_VALID_MASK[7]);
    if !whole_bytes_valid {
        return false;
    }

    match n % 8 {
        0 => true,
        tail_bits => {
            // Mask off padding so stray high bits cannot turn a fully valid
            // row into a false negative.
            let mask = ALL_VALID_MASK[tail_bits - 1];
            (data[n / 8] & mask) == mask
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use rand::Rng;

    /// Packs `bs` into a fresh bitmap and checks that `all_valid` agrees with
    /// whether every entry of `bs` is `true`.
    fn test_validity(bs: &[bool]) {
        let mut data = vec![0u8; bytes_for(bs.len())];
        bs.iter()
            .enumerate()
            .for_each(|(i, b)| set_bit(&mut data, i, *b));
        let expected = !bs.contains(&false);
        assert_eq!(all_valid(&data, bs.len()), expected);
    }

    #[test]
    fn test_all_valid() {
        let mut rng = rand::thread_rng();
        for size in [4, 8, 12, 16, 19, 23, 32, 44] {
            // Random bits: very likely to contain a `false`, though not
            // guaranteed to for the smallest sizes.
            let mut random = vec![false; size];
            rng.fill(&mut random[..]);
            test_validity(&random);

            // all true
            test_validity(&vec![true; size]);
        }
    }
}
Loading