This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Made Vec optional.
jorgecarleitao committed Sep 23, 2021
1 parent 61676cc commit 7fb641f
Showing 9 changed files with 616 additions and 7 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
@@ -125,6 +125,7 @@ compute = ["strength_reduce", "multiversion", "lexical-core", "ahash"]
io_parquet = ["parquet2", "io_ipc", "base64", "futures"]
benchmarks = ["rand"]
simd = ["packed_simd"]
cache_aligned = []

[package.metadata.cargo-all-features]
skip_feature_sets = [
119 changes: 119 additions & 0 deletions src/alloc/alignment.rs
@@ -0,0 +1,119 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// NOTE: The code below is written for spatial/temporal prefetcher optimizations. Memory allocation
// should align well with the cache-access usage pattern and the block sizes of the storage hierarchy,
// from registers to non-volatile memory. These are all cache-aware alignments incorporated
// from the [cuneiform](https://crates.io/crates/cuneiform) crate. This approach mimics Intel TBB's
// cache_aligned_allocator, which exploits cache locality and minimizes prefetch signals,
// resulting in less round-trip time between the layers of storage.
// For further info: https://software.intel.com/en-us/node/506094

// 32-bit x86 and everything other than the NetBurst microarchitecture use 64 bytes.
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "x86")]
pub const ALIGNMENT: usize = 1 << 6;

// Intel x86_64:
// L2D streamer from L1:
// Loads data or instructions from memory to the second-level cache. To use the streamer,
// organize the data or instructions in blocks of 128 bytes, aligned on 128 bytes.
// - https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "x86_64")]
pub const ALIGNMENT: usize = 1 << 7;

// 24Kc:
// Data Line Size
// - https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MD00346-2B-24K-DTS-04.00.pdf
// - https://gitlab.e.foundation/e/devices/samsung/n7100/stable_android_kernel_samsung_smdk4412/commit/2dbac10263b2f3c561de68b4c369bc679352ccee
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "mips")]
pub const ALIGNMENT: usize = 1 << 5;
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "mips64")]
pub const ALIGNMENT: usize = 1 << 5;

// Defaults for powerpc
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "powerpc")]
pub const ALIGNMENT: usize = 1 << 5;

// Defaults for the ppc 64
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "powerpc64")]
pub const ALIGNMENT: usize = 1 << 6;

// e.g.: SiFive
// - https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/riscv/sifive-l2-cache.txt#L41
// In general they are all the same.
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "riscv")]
pub const ALIGNMENT: usize = 1 << 6;

// This size is the same across all hardware for this architecture.
// - https://docs.huihoo.com/doxygen/linux/kernel/3.7/arch_2s390_2include_2asm_2cache_8h.html
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "s390x")]
pub const ALIGNMENT: usize = 1 << 8;

// This size is the same across all hardware for this architecture.
// - https://docs.huihoo.com/doxygen/linux/kernel/3.7/arch_2sparc_2include_2asm_2cache_8h.html#a9400cc2ba37e33279bdbc510a6311fb4
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "sparc")]
pub const ALIGNMENT: usize = 1 << 5;
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "sparc64")]
pub const ALIGNMENT: usize = 1 << 6;

// On ARM, cache line sizes are fixed for both v6 and v7.
// Board- or platform-specific values may need to be added later.
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "thumbv6")]
pub const ALIGNMENT: usize = 1 << 5;
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "thumbv7")]
pub const ALIGNMENT: usize = 1 << 5;

// The operating system's cache size determines this.
// There is currently no way to determine it without runtime inference.
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "wasm32")]
pub const ALIGNMENT: usize = 1 << 6;

// Same as v6 and v7.
// Cortex-A, -M, -R, ARM v7, v7-M, Krait and Neoverse N all use this size.
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "arm")]
pub const ALIGNMENT: usize = 1 << 5;

// Combined from 4 sectors; Volta reports 128.
// To avoid defeating chunk optimizations, it is better to go with the default size.
// If you have smaller data with less padding, 32 can be used with the force option.
// - https://devtalk.nvidia.com/default/topic/803600/variable-cache-line-width-/
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "nvptx")]
pub const ALIGNMENT: usize = 1 << 7;
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "nvptx64")]
pub const ALIGNMENT: usize = 1 << 7;

// This size is the same across all hardware for this architecture.
/// Cache and allocation multiple alignment size
#[cfg(target_arch = "aarch64")]
pub const ALIGNMENT: usize = 1 << 6;
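
For illustration, a minimal sketch (not part of this commit) of what these constants buy: any region allocated with a `Layout` aligned to `ALIGNMENT` starts on a cache-line (or, on x86_64, L2-streamer block) boundary. The `ALIGNMENT` value below is a stand-in assuming x86_64; only the standard allocator is used.

use std::alloc::{alloc, dealloc, Layout};

// Stand-in for the per-architecture constant defined above (x86_64 value).
const ALIGNMENT: usize = 1 << 7;

fn main() {
    // Request 256 bytes aligned to a multiple of the cache-aware alignment.
    let layout = Layout::from_size_align(256, ALIGNMENT).unwrap();
    unsafe {
        let ptr = alloc(layout);
        assert!(!ptr.is_null());
        // The address is a multiple of ALIGNMENT, so the start of the region
        // never shares a cache line with an unrelated allocation, and it lands
        // on a prefetch-friendly 128-byte block boundary on x86_64.
        assert_eq!(ptr as usize % ALIGNMENT, 0);
        dealloc(ptr, layout);
    }
}
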
128 changes: 128 additions & 0 deletions src/alloc/mod.rs
@@ -0,0 +1,128 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Defines memory-related functions, such as allocate/deallocate/reallocate memory
//! regions, cache and allocation alignments.
use std::mem::size_of;
use std::ptr::NonNull;
use std::{
    alloc::{handle_alloc_error, Layout},
    sync::atomic::AtomicIsize,
};

use crate::types::NativeType;

mod alignment;

pub use alignment::ALIGNMENT;

// If this number is not zero after all objects have been dropped, there is a memory leak
static mut ALLOCATIONS: AtomicIsize = AtomicIsize::new(0);

/// # Safety
/// This pointer may only be used to check if memory is allocated.
#[inline]
pub unsafe fn dangling<T: NativeType>() -> NonNull<T> {
    NonNull::new_unchecked(ALIGNMENT as *mut T)
}

/// Allocates a cache-aligned memory region able to hold `size` items of `T`, with uninitialized values.
/// This is more performant than using [allocate_aligned_zeroed] when all bytes will have
/// an unknown or non-zero value and is semantically similar to `malloc`.
pub fn allocate_aligned<T: NativeType>(size: usize) -> NonNull<T> {
    unsafe {
        if size == 0 {
            dangling()
        } else {
            let size = size * size_of::<T>();
            ALLOCATIONS.fetch_add(size as isize, std::sync::atomic::Ordering::SeqCst);

            let layout = Layout::from_size_align_unchecked(size, ALIGNMENT);
            let raw_ptr = std::alloc::alloc(layout) as *mut T;
            NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout))
        }
    }
}

/// Allocates a cache-aligned memory region able to hold `size` items of `T`, all set to `0`.
/// This is more performant than using [allocate_aligned] and setting all bytes to zero,
/// and is semantically similar to `calloc`.
pub fn allocate_aligned_zeroed<T: NativeType>(size: usize) -> NonNull<T> {
    unsafe {
        if size == 0 {
            dangling()
        } else {
            let size = size * size_of::<T>();
            ALLOCATIONS.fetch_add(size as isize, std::sync::atomic::Ordering::SeqCst);

            let layout = Layout::from_size_align_unchecked(size, ALIGNMENT);
            let raw_ptr = std::alloc::alloc_zeroed(layout) as *mut T;
            NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout))
        }
    }
}

/// Frees memory previously allocated by [`allocate_aligned_zeroed`] or [`allocate_aligned`].
/// # Safety
/// This function is sound iff:
///
/// * `ptr` was allocated by [`allocate_aligned_zeroed`] or [`allocate_aligned`]
/// * `size` must be the same size that was used to allocate that block of memory.
pub unsafe fn free_aligned<T: NativeType>(ptr: NonNull<T>, size: usize) {
    if size != 0 {
        let size = size * size_of::<T>();
        ALLOCATIONS.fetch_sub(size as isize, std::sync::atomic::Ordering::SeqCst);
        std::alloc::dealloc(
            ptr.as_ptr() as *mut u8,
            Layout::from_size_align_unchecked(size, ALIGNMENT),
        );
    }
}

/// Reallocates memory previously allocated by [`allocate_aligned_zeroed`] or [`allocate_aligned`].
/// # Safety
/// This function is sound iff `ptr` was previously allocated by `allocate_aligned` or `allocate_aligned_zeroed` for `old_size` items.
pub unsafe fn reallocate<T: NativeType>(
    ptr: NonNull<T>,
    old_size: usize,
    new_size: usize,
) -> NonNull<T> {
    if old_size == 0 {
        return allocate_aligned(new_size);
    }

    if new_size == 0 {
        free_aligned(ptr, old_size);
        return dangling();
    }
    let old_size = old_size * size_of::<T>();
    let new_size = new_size * size_of::<T>();

    ALLOCATIONS.fetch_add(
        new_size as isize - old_size as isize,
        std::sync::atomic::Ordering::SeqCst,
    );
    let raw_ptr = std::alloc::realloc(
        ptr.as_ptr() as *mut u8,
        Layout::from_size_align_unchecked(old_size, ALIGNMENT),
        new_size,
    ) as *mut T;
    NonNull::new(raw_ptr).unwrap_or_else(|| {
        handle_alloc_error(Layout::from_size_align_unchecked(new_size, ALIGNMENT))
    })
}
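
A hedged usage sketch of the allocator API above (crate-internal, since `mod alloc` is private; assumes `i32: NativeType`, as elsewhere in the crate):

use crate::alloc::{allocate_aligned, free_aligned, reallocate};

fn allocator_roundtrip() {
    // Room for 4 i32 items (16 bytes), cache-aligned but uninitialized.
    let ptr = allocate_aligned::<i32>(4);
    unsafe {
        for i in 0..4 {
            ptr.as_ptr().add(i).write(i as i32);
        }
        // Grow to 8 items; like `realloc`, the first 4 values are preserved.
        let ptr = reallocate(ptr, 4, 8);
        assert_eq!(ptr.as_ptr().read(), 0);
        // The size passed to free must match the last (re)allocation,
        // per the safety contract documented above.
        free_aligned(ptr, 8);
    }
}
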
7 changes: 6 additions & 1 deletion src/buffer/bytes.rs
@@ -7,6 +7,8 @@ use std::{ptr::NonNull, sync::Arc};

use crate::ffi;
use crate::types::NativeType;
#[cfg(feature = "cache_aligned")]
use crate::vec::AlignedVec as Vec;

/// Mode of deallocating memory regions
pub enum Deallocation {
@@ -89,7 +91,10 @@ impl<T: NativeType> Drop for Bytes<T> {
    fn drop(&mut self) {
        match &self.deallocation {
            Deallocation::Native(capacity) => unsafe {
                Vec::from_raw_parts(self.ptr.as_ptr(), self.len, *capacity);
                #[cfg(feature = "cache_aligned")]
                let _ = Vec::from_raw_parts(self.ptr, self.len, *capacity);
                #[cfg(not(feature = "cache_aligned"))]
                let _ = Vec::from_raw_parts(self.ptr.as_ptr(), self.len, *capacity);
            },
            // foreign interface knows how to deallocate itself.
            Deallocation::Foreign(_) => (),
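
The `Drop` arm above rebuilds the owning container from raw parts so the memory is released by the same allocator that produced it; with `cache_aligned` on, `Vec` is the aliased `AlignedVec`, otherwise the standard `Vec`. A self-contained sketch of that reconstruct-and-drop pattern (hypothetical `RawBytes` type, plain `Vec` only):

use std::ptr::NonNull;

// Hypothetical owner of memory that originally came from a Vec.
struct RawBytes {
    ptr: NonNull<u8>,
    len: usize,
    capacity: usize,
}

impl Drop for RawBytes {
    fn drop(&mut self) {
        // Rebuild the Vec from its raw parts; dropping it frees the allocation
        // with the same allocator, length and capacity that produced it.
        unsafe {
            let _ = Vec::from_raw_parts(self.ptr.as_ptr(), self.len, self.capacity);
        }
    }
}

fn main() {
    let mut v = vec![1u8, 2, 3];
    let raw = RawBytes {
        ptr: NonNull::new(v.as_mut_ptr()).unwrap(),
        len: v.len(),
        capacity: v.capacity(),
    };
    std::mem::forget(v); // ownership of the allocation moves to `raw`
    drop(raw); // the original allocation is freed exactly once, in Drop
}
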
10 changes: 10 additions & 0 deletions src/buffer/immutable.rs
@@ -52,6 +52,16 @@ impl<T: NativeType> Buffer<T> {
        MutableBuffer::from_len_zeroed(length).into()
    }

    /// Takes ownership of [`Vec`].
    /// # Implementation
    /// This function is `O(1)`
    #[cfg(not(feature = "cache_aligned"))]
    #[cfg_attr(docsrs, doc(cfg(not(feature = "cache_aligned"))))]
    #[inline]
    pub fn from_vec(data: Vec<T>) -> Self {
        MutableBuffer::from_vec(data).into()
    }

    /// Auxiliary method to create a new Buffer
    pub(crate) fn from_bytes(bytes: Bytes<T>) -> Self {
        let length = bytes.len();
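
A hedged sketch of the new constructor from a caller's perspective (assumes the crate's public `Buffer` API at this commit; only compiles when `cache_aligned` is disabled):

#[cfg(not(feature = "cache_aligned"))]
fn vec_into_buffer() {
    use arrow2::buffer::Buffer;

    let values = vec![1i32, 2, 3, 4];
    // O(1): the Vec's allocation is taken over, nothing is copied.
    let buffer = Buffer::from_vec(values);
    assert_eq!(buffer.len(), 4);
}
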
32 changes: 26 additions & 6 deletions src/buffer/mutable.rs
@@ -6,13 +6,15 @@ use crate::trusted_len::TrustedLen;
use crate::types::{BitChunk, NativeType};

use super::bytes::{Bytes, Deallocation};
#[cfg(feature = "cache_aligned")]
use crate::vec::AlignedVec as Vec;

use super::immutable::Buffer;

/// A [`MutableBuffer`] is this crates' interface to store types that are byte-like such as `i32`.
/// It behaves like a [`Vec`], with the following differences:
/// * memory is allocated along cache lines and in multiple of 64 bytes.
/// * it can only hold types supported by the arrow format (`u8-u64`, `i8-i128`, `f32,f64` and [`crate::types::days_ms`])
/// It behaves like a [`Vec`] but can only hold types supported by the arrow format
/// (`u8-u64`, `i8-i128`, `f32,f64`, [`crate::types::days_ms`] and [`crate::types::months_days_ns`]).
/// When the feature `cache_aligned` is active, memory is allocated along cache lines and in multiples of 64 bytes.
/// A [`MutableBuffer`] can be converted to a [`Buffer`] via `.into`.
/// # Example
/// ```
@@ -28,6 +30,14 @@ pub struct MutableBuffer<T: NativeType> {
    data: Vec<T>,
}

#[cfg(not(feature = "cache_aligned"))]
#[cfg_attr(docsrs, doc(cfg(not(feature = "cache_aligned"))))]
impl<T: NativeType> From<MutableBuffer<T>> for Vec<T> {
    fn from(data: MutableBuffer<T>) -> Self {
        data.data
    }
}

impl<T: NativeType> std::fmt::Debug for MutableBuffer<T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Debug::fmt(&**self, f)
@@ -55,6 +65,14 @@ impl<T: NativeType> MutableBuffer<T> {
    }
}

    /// Takes ownership of [`Vec`].
    #[cfg(not(feature = "cache_aligned"))]
    #[cfg_attr(docsrs, doc(cfg(not(feature = "cache_aligned"))))]
    #[inline]
    pub fn from_vec(data: Vec<T>) -> Self {
        Self { data }
    }

    /// Allocates a new [MutableBuffer] with `len` and capacity to be at least `len`
    /// where data is zeroed.
    /// # Example
@@ -68,9 +86,11 @@ impl<T: NativeType> MutableBuffer<T> {
    /// ```
    #[inline]
    pub fn from_len_zeroed(len: usize) -> Self {
        Self {
            data: vec![T::default(); len],
        }
        #[cfg(not(feature = "cache_aligned"))]
        let data = vec![T::default(); len];
        #[cfg(feature = "cache_aligned")]
        let data = Vec::from_len_zeroed(len);
        Self { data }
    }

    /// Ensures that this buffer has at least `self.len + additional` bytes. This re-allocates iff
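
Together with the `From<MutableBuffer<T>> for Vec<T>` impl above, `from_vec` makes the round-trip between `Vec` and `MutableBuffer` a pair of moves under the default feature set. A hedged sketch (assuming the arrow2 public API at this commit):

#[cfg(not(feature = "cache_aligned"))]
fn vec_roundtrip() {
    use arrow2::buffer::MutableBuffer;

    let data = vec![0u32, 1, 2, 3];
    let mut buffer = MutableBuffer::from_vec(data); // takes ownership, O(1)
    buffer.push(4);
    let back: Vec<u32> = buffer.into(); // From impl above, also O(1)
    assert_eq!(back, vec![0, 1, 2, 3, 4]);
}
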
4 changes: 4 additions & 0 deletions src/lib.rs
@@ -4,13 +4,17 @@

#[macro_use]
pub mod array;
#[cfg(feature = "cache_aligned")]
mod alloc;
pub mod bitmap;
pub mod buffer;
mod endianess;
pub mod error;
pub mod scalar;
pub mod trusted_len;
pub mod types;
#[cfg(feature = "cache_aligned")]
mod vec;

#[cfg(feature = "compute")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute")))]

