This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Made Vec optional.
jorgecarleitao committed Sep 23, 2021
1 parent 61676cc commit 7fb641f
Showing 9 changed files with 616 additions and 7 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
@@ -125,6 +125,7 @@ compute = ["strength_reduce", "multiversion", "lexical-core", "ahash"]
io_parquet = ["parquet2", "io_ipc", "base64", "futures"]
benchmarks = ["rand"]
simd = ["packed_simd"]
cache_aligned = []

[package.metadata.cargo-all-features]
skip_feature_sets = [
119 changes: 119 additions & 0 deletions src/alloc/alignment.rs
@@ -0,0 +1,119 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// NOTE: The code below is written for spatial/temporal prefetcher optimizations. Memory allocation
// should align well with the cache-access usage pattern and the block sizes of the storage hierarchy,
// from registers to non-volatile memory. These are all cache-aware alignments incorporated
// from the [cuneiform](https://crates.io/crates/cuneiform) crate. This approach mimics Intel TBB's
// cache_aligned_allocator, which exploits cache locality and minimizes prefetch signals,
// resulting in less round-trip time between the layers of storage.
// For further info: https://software.intel.com/en-us/node/506094

// 32-bit x86 and everything other than the NetBurst microarchitecture use 64 bytes.
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "x86")]
pub const ALIGNMENT: usize = 1 << 6;

// Intel x86_64:
// L2D streamer from L1:
// Loads data or instructions from memory to the second-level cache. To use the streamer,
// organize the data or instructions in blocks of 128 bytes, aligned on 128 bytes.
// - https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "x86_64")]
pub const ALIGNMENT: usize = 1 << 7;

// 24Kc:
// Data Line Size
// - https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MD00346-2B-24K-DTS-04.00.pdf
// - https://gitlab.e.foundation/e/devices/samsung/n7100/stable_android_kernel_samsung_smdk4412/commit/2dbac10263b2f3c561de68b4c369bc679352ccee
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "mips")]
pub const ALIGNMENT: usize = 1 << 5;
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "mips64")]
pub const ALIGNMENT: usize = 1 << 5;

// Defaults for powerpc
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "powerpc")]
pub const ALIGNMENT: usize = 1 << 5;

// Defaults for the ppc 64
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "powerpc64")]
pub const ALIGNMENT: usize = 1 << 6;

// e.g.: SiFive
// - https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/riscv/sifive-l2-cache.txt#L41
// In general they are all the same.
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "riscv")]
pub const ALIGNMENT: usize = 1 << 6;

// This size is the same across all hardware for this architecture.
// - https://docs.huihoo.com/doxygen/linux/kernel/3.7/arch_2s390_2include_2asm_2cache_8h.html
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "s390x")]
pub const ALIGNMENT: usize = 1 << 8;

// This size is the same across all hardware for this architecture.
// - https://docs.huihoo.com/doxygen/linux/kernel/3.7/arch_2sparc_2include_2asm_2cache_8h.html#a9400cc2ba37e33279bdbc510a6311fb4
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "sparc")]
pub const ALIGNMENT: usize = 1 << 5;
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "sparc64")]
pub const ALIGNMENT: usize = 1 << 6;

// On ARM, cache line sizes are fixed for both v6 and v7.
// Board- or platform-specific values may need to be added later.
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "thumbv6")]
pub const ALIGNMENT: usize = 1 << 5;
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "thumbv7")]
pub const ALIGNMENT: usize = 1 << 5;

// The operating system's cache size determines this.
// There is currently no way to determine it without runtime inference.
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "wasm32")]
pub const ALIGNMENT: usize = 1 << 6;

// Same as v6 and v7.
// Cortex-A, -M, -R, ARM v7, v7-M, Krait and Neoverse N all use this size.
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "arm")]
pub const ALIGNMENT: usize = 1 << 5;

// Combined from 4 sectors; Volta reports 128.
// To avoid defeating chunk optimizations, it is better to go with the default size.
// If you have smaller data with less padding, 32 can be used with the force option.
// - https://devtalk.nvidia.com/default/topic/803600/variable-cache-line-width-/
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "nvptx")]
pub const ALIGNMENT: usize = 1 << 7;
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "nvptx64")]
pub const ALIGNMENT: usize = 1 << 7;

// This size is the same across all hardware for this architecture.
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "aarch64")]
pub const ALIGNMENT: usize = 1 << 6;
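
For illustration, a minimal sketch (not part of this commit) of what these constants buy: any region allocated with a `Layout` aligned to `ALIGNMENT` starts on a cache-line (or, on x86_64, L2-streamer block) boundary. The `ALIGNMENT` value below is a stand-in assuming x86_64; only the standard allocator is used.

use std::alloc::{alloc, dealloc, Layout};

// Stand-in for the per-architecture constant defined above (x86_64 value).
const ALIGNMENT: usize = 1 << 7;

fn main() {
    // Request 256 bytes aligned to a multiple of the cache-aware alignment.
    let layout = Layout::from_size_align(256, ALIGNMENT).unwrap();
    unsafe {
        let ptr = alloc(layout);
        assert!(!ptr.is_null());
        // The address is a multiple of ALIGNMENT, so the start of the region
        // never shares a cache line with an unrelated allocation, and it lands
        // on a prefetch-friendly 128-byte block boundary on x86_64.
        assert_eq!(ptr as usize % ALIGNMENT, 0);
        dealloc(ptr, layout);
    }
}
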
128 changes: 128 additions & 0 deletions src/alloc/mod.rs
@@ -0,0 +1,128 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Defines memory-related functions, such as allocate/deallocate/reallocate memory
//! regions, cache and allocation alignments.
use std::mem::size_of;
use std::ptr::NonNull;
use std::{
    alloc::{handle_alloc_error, Layout},
    sync::atomic::AtomicIsize,
};

use crate::types::NativeType;

mod alignment;

pub use alignment::ALIGNMENT;

// If this number is not zero after all objects have been dropped, there is a memory leak
static mut ALLOCATIONS: AtomicIsize = AtomicIsize::new(0);

/// # Safety
/// This pointer may only be used to check if memory is allocated.
#[inline]
pub unsafe fn dangling<T: NativeType>() -> NonNull<T> {
    NonNull::new_unchecked(ALIGNMENT as *mut T)
}

/// Allocates a cache-aligned memory region able to hold `size` items of `T`, with uninitialized values.
/// This is more performant than using [allocate_aligned_zeroed] when all bytes will have
/// an unknown or non-zero value and is semantically similar to `malloc`.
pub fn allocate_aligned<T: NativeType>(size: usize) -> NonNull<T> {
    unsafe {
        if size == 0 {
            dangling()
        } else {
            let size = size * size_of::<T>();
            ALLOCATIONS.fetch_add(size as isize, std::sync::atomic::Ordering::SeqCst);

            let layout = Layout::from_size_align_unchecked(size, ALIGNMENT);
            let raw_ptr = std::alloc::alloc(layout) as *mut T;
            NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout))
        }
    }
}

/// Allocates a cache-aligned memory region able to hold `size` items of `T`, all set to `0`.
/// This is more performant than using [allocate_aligned] and setting all bytes to zero,
/// and is semantically similar to `calloc`.
pub fn allocate_aligned_zeroed<T: NativeType>(size: usize) -> NonNull<T> {
    unsafe {
        if size == 0 {
            dangling()
        } else {
            let size = size * size_of::<T>();
            ALLOCATIONS.fetch_add(size as isize, std::sync::atomic::Ordering::SeqCst);

            let layout = Layout::from_size_align_unchecked(size, ALIGNMENT);
            let raw_ptr = std::alloc::alloc_zeroed(layout) as *mut T;
            NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout))
        }
    }
}

/// Frees memory previously allocated by [`allocate_aligned_zeroed`] or [`allocate_aligned`].
/// # Safety
/// This function is sound iff:
///
/// * `ptr` was allocated by [`allocate_aligned_zeroed`] or [`allocate_aligned`]
/// * `size` must be the same size that was used to allocate that block of memory.
pub unsafe fn free_aligned<T: NativeType>(ptr: NonNull<T>, size: usize) {
    if size != 0 {
        let size = size * size_of::<T>();
        ALLOCATIONS.fetch_sub(size as isize, std::sync::atomic::Ordering::SeqCst);
        std::alloc::dealloc(
            ptr.as_ptr() as *mut u8,
            Layout::from_size_align_unchecked(size, ALIGNMENT),
        );
    }
}

/// Reallocates memory previously allocated by [`allocate_aligned_zeroed`] or [`allocate_aligned`].
/// # Safety
/// This function is sound iff `ptr` was previously allocated by `allocate_aligned` or `allocate_aligned_zeroed` for `old_size` items.
pub unsafe fn reallocate<T: NativeType>(
    ptr: NonNull<T>,
    old_size: usize,
    new_size: usize,
) -> NonNull<T> {
    if old_size == 0 {
        return allocate_aligned(new_size);
    }

    if new_size == 0 {
        free_aligned(ptr, old_size);
        return dangling();
    }
    let old_size = old_size * size_of::<T>();
    let new_size = new_size * size_of::<T>();

    ALLOCATIONS.fetch_add(
        new_size as isize - old_size as isize,
        std::sync::atomic::Ordering::SeqCst,
    );
    let raw_ptr = std::alloc::realloc(
        ptr.as_ptr() as *mut u8,
        Layout::from_size_align_unchecked(old_size, ALIGNMENT),
        new_size,
    ) as *mut T;
    NonNull::new(raw_ptr).unwrap_or_else(|| {
        handle_alloc_error(Layout::from_size_align_unchecked(new_size, ALIGNMENT))
    })
}
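
A hedged usage sketch of the allocator API above (crate-internal, since `mod alloc` is private; assumes `i32: NativeType`, as elsewhere in the crate):

use crate::alloc::{allocate_aligned, free_aligned, reallocate};

fn allocator_roundtrip() {
    // Room for 4 i32 items (16 bytes), cache-aligned but uninitialized.
    let ptr = allocate_aligned::<i32>(4);
    unsafe {
        for i in 0..4 {
            ptr.as_ptr().add(i).write(i as i32);
        }
        // Grow to 8 items; like `realloc`, the first 4 values are preserved.
        let ptr = reallocate(ptr, 4, 8);
        assert_eq!(ptr.as_ptr().read(), 0);
        // The size passed to free must match the last (re)allocation,
        // per the safety contract documented above.
        free_aligned(ptr, 8);
    }
}
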
7 changes: 6 additions & 1 deletion src/buffer/bytes.rs
@@ -7,6 +7,8 @@ use std::{ptr::NonNull, sync::Arc};

use crate::ffi;
use crate::types::NativeType;
#[cfg(feature = "cache_aligned")]
use crate::vec::AlignedVec as Vec;

/// Mode of deallocating memory regions
pub enum Deallocation {
@@ -89,7 +91,10 @@ impl<T: NativeType> Drop for Bytes<T> {
    fn drop(&mut self) {
        match &self.deallocation {
            Deallocation::Native(capacity) => unsafe {
                Vec::from_raw_parts(self.ptr.as_ptr(), self.len, *capacity);
                #[cfg(feature = "cache_aligned")]
                let _ = Vec::from_raw_parts(self.ptr, self.len, *capacity);
                #[cfg(not(feature = "cache_aligned"))]
                let _ = Vec::from_raw_parts(self.ptr.as_ptr(), self.len, *capacity);
            },
            // foreign interface knows how to deallocate itself.
            Deallocation::Foreign(_) => (),
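
The `Drop` arm above rebuilds the owning container from raw parts so the memory is released by the same allocator that produced it; with `cache_aligned` on, `Vec` is the aliased `AlignedVec`, otherwise the standard `Vec`. A self-contained sketch of that reconstruct-and-drop pattern (hypothetical `RawBytes` type, plain `Vec` only):

use std::ptr::NonNull;

// Hypothetical owner of memory that originally came from a Vec.
struct RawBytes {
    ptr: NonNull<u8>,
    len: usize,
    capacity: usize,
}

impl Drop for RawBytes {
    fn drop(&mut self) {
        // Rebuild the Vec from its raw parts; dropping it frees the allocation
        // with the same allocator, length and capacity that produced it.
        unsafe {
            let _ = Vec::from_raw_parts(self.ptr.as_ptr(), self.len, self.capacity);
        }
    }
}

fn main() {
    let mut v = vec![1u8, 2, 3];
    let raw = RawBytes {
        ptr: NonNull::new(v.as_mut_ptr()).unwrap(),
        len: v.len(),
        capacity: v.capacity(),
    };
    std::mem::forget(v); // ownership of the allocation moves to `raw`
    drop(raw); // the original allocation is freed exactly once, in Drop
}
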
10 changes: 10 additions & 0 deletions src/buffer/immutable.rs
@@ -52,6 +52,16 @@ impl<T: NativeType> Buffer<T> {
        MutableBuffer::from_len_zeroed(length).into()
    }

    /// Takes ownership of [`Vec`].
    /// # Implementation
    /// This function is `O(1)`
    #[cfg(not(feature = "cache_aligned"))]
    #[cfg_attr(docsrs, doc(cfg(not(feature = "cache_aligned"))))]
    #[inline]
    pub fn from_vec(data: Vec<T>) -> Self {
        MutableBuffer::from_vec(data).into()
    }

    /// Auxiliary method to create a new Buffer
    pub(crate) fn from_bytes(bytes: Bytes<T>) -> Self {
        let length = bytes.len();
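
A hedged sketch of the new constructor from a caller's perspective (assumes the crate's public `Buffer` API at this commit; only compiles when `cache_aligned` is disabled):

#[cfg(not(feature = "cache_aligned"))]
fn vec_into_buffer() {
    use arrow2::buffer::Buffer;

    let values = vec![1i32, 2, 3, 4];
    // O(1): the Vec's allocation is taken over, nothing is copied.
    let buffer = Buffer::from_vec(values);
    assert_eq!(buffer.len(), 4);
}
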
32 changes: 26 additions & 6 deletions src/buffer/mutable.rs
@@ -6,13 +6,15 @@ use crate::trusted_len::TrustedLen;
use crate::types::{BitChunk, NativeType};

use super::bytes::{Bytes, Deallocation};
#[cfg(feature = "cache_aligned")]
use crate::vec::AlignedVec as Vec;

use super::immutable::Buffer;

/// A [`MutableBuffer`] is this crates' interface to store types that are byte-like such as `i32`.
/// It behaves like a [`Vec`], with the following differences:
/// * memory is allocated along cache lines and in multiple of 64 bytes.
/// * it can only hold types supported by the arrow format (`u8-u64`, `i8-i128`, `f32,f64` and [`crate::types::days_ms`])
/// It behaves like a [`Vec`] but can only hold types supported by the arrow format
/// (`u8-u64`, `i8-i128`, `f32,f64`, [`crate::types::days_ms`] and [`crate::types::months_days_ns`]).
/// When the feature `cache_aligned` is active, memory is allocated along cache lines and in multiples of 64 bytes.
/// A [`MutableBuffer`] can be converted to a [`Buffer`] via `.into`.
/// # Example
/// ```
@@ -28,6 +30,14 @@ pub struct MutableBuffer<T: NativeType> {
    data: Vec<T>,
}

#[cfg(not(feature = "cache_aligned"))]
#[cfg_attr(docsrs, doc(cfg(not(feature = "cache_aligned"))))]
impl<T: NativeType> From<MutableBuffer<T>> for Vec<T> {
    fn from(data: MutableBuffer<T>) -> Self {
        data.data
    }
}

impl<T: NativeType> std::fmt::Debug for MutableBuffer<T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Debug::fmt(&**self, f)
@@ -55,6 +65,14 @@ impl<T: NativeType> MutableBuffer<T> {
    }
}

    /// Takes ownership of [`Vec`].
    #[cfg(not(feature = "cache_aligned"))]
    #[cfg_attr(docsrs, doc(cfg(not(feature = "cache_aligned"))))]
    #[inline]
    pub fn from_vec(data: Vec<T>) -> Self {
        Self { data }
    }

    /// Allocates a new [MutableBuffer] with `len` and capacity to be at least `len`
    /// where data is zeroed.
    /// # Example
@@ -68,9 +86,11 @@ impl<T: NativeType> MutableBuffer<T> {
    /// ```
    #[inline]
    pub fn from_len_zeroed(len: usize) -> Self {
        Self {
            data: vec![T::default(); len],
        }
        #[cfg(not(feature = "cache_aligned"))]
        let data = vec![T::default(); len];
        #[cfg(feature = "cache_aligned")]
        let data = Vec::from_len_zeroed(len);
        Self { data }
    }

    /// Ensures that this buffer has at least `self.len + additional` bytes. This re-allocates iff
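
Together with the `From<MutableBuffer<T>> for Vec<T>` impl above, `from_vec` makes the round-trip between `Vec` and `MutableBuffer` a pair of moves under the default feature set. A hedged sketch (assuming the arrow2 public API at this commit):

#[cfg(not(feature = "cache_aligned"))]
fn vec_roundtrip() {
    use arrow2::buffer::MutableBuffer;

    let data = vec![0u32, 1, 2, 3];
    let mut buffer = MutableBuffer::from_vec(data); // takes ownership, O(1)
    buffer.push(4);
    let back: Vec<u32> = buffer.into(); // From impl above, also O(1)
    assert_eq!(back, vec![0, 1, 2, 3, 4]);
}
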
4 changes: 4 additions & 0 deletions src/lib.rs
@@ -4,13 +4,17 @@

#[macro_use]
pub mod array;
#[cfg(feature = "cache_aligned")]
mod alloc;
pub mod bitmap;
pub mod buffer;
mod endianess;
pub mod error;
pub mod scalar;
pub mod trusted_len;
pub mod types;
#[cfg(feature = "cache_aligned")]
mod vec;

#[cfg(feature = "compute")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute")))]

