From 7fb641f2a0da875c471660eb4856543872b15746 Mon Sep 17 00:00:00 2001
From: "Jorge C. Leitao"
Date: Thu, 23 Sep 2021 21:46:49 +0000
Subject: [PATCH] Made Vec optional.

---
 Cargo.toml                   |   1 +
 src/alloc/alignment.rs       | 119 +++++++++++++
 src/alloc/mod.rs             | 128 ++++++++++++++
 src/buffer/bytes.rs          |   7 +-
 src/buffer/immutable.rs      |  10 ++
 src/buffer/mutable.rs        |  32 +++-
 src/lib.rs                   |   4 +
 src/vec.rs                   | 314 +++++++++++++++++++++++++++++++++++
 tests/it/buffer/immutable.rs |   8 +
 9 files changed, 616 insertions(+), 7 deletions(-)
 create mode 100644 src/alloc/alignment.rs
 create mode 100644 src/alloc/mod.rs
 create mode 100644 src/vec.rs

diff --git a/Cargo.toml b/Cargo.toml
index e00dcbce12b..9e515dc16d5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -125,6 +125,7 @@ compute = ["strength_reduce", "multiversion", "lexical-core", "ahash"]
 io_parquet = ["parquet2", "io_ipc", "base64", "futures"]
 benchmarks = ["rand"]
 simd = ["packed_simd"]
+cache_aligned = []
 
 [package.metadata.cargo-all-features]
 skip_feature_sets = [
diff --git a/src/alloc/alignment.rs b/src/alloc/alignment.rs
new file mode 100644
index 00000000000..dbf4602f83a
--- /dev/null
+++ b/src/alloc/alignment.rs
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// NOTE: the code below is written for spatial/temporal prefetcher optimizations. Memory allocation
+// should align well with the usage pattern of cache access and block sizes on the layers of storage,
+// from registers to non-volatile memory. These are all cache-aware alignments incorporated
+// from the [cuneiform](https://crates.io/crates/cuneiform) crate. This approach mimics Intel TBB's
+// cache_aligned_allocator, which exploits cache locality and minimizes prefetch signals,
+// resulting in less round-trip time between the layers of storage.
+// For further info: https://software.intel.com/en-us/node/506094
+
+// 32-bit architectures and anything other than the NetBurst microarchitecture use 64 bytes.
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "x86")]
+pub const ALIGNMENT: usize = 1 << 6;
+
+// Intel x86_64:
+// L2D streamer from L1:
+// Loads data or instructions from memory to the second-level cache. To use the streamer,
+// organize the data or instructions in blocks of 128 bytes, aligned on 128 bytes.
+// - https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "x86_64")]
+pub const ALIGNMENT: usize = 1 << 7;
+
+// 24Kc:
+// Data Line Size
+// - https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MD00346-2B-24K-DTS-04.00.pdf
+// - https://gitlab.e.foundation/e/devices/samsung/n7100/stable_android_kernel_samsung_smdk4412/commit/2dbac10263b2f3c561de68b4c369bc679352ccee
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "mips")]
+pub const ALIGNMENT: usize = 1 << 5;
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "mips64")]
+pub const ALIGNMENT: usize = 1 << 5;
+
+// Defaults for powerpc
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "powerpc")]
+pub const ALIGNMENT: usize = 1 << 5;
+
+// Defaults for the ppc 64
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "powerpc64")]
+pub const ALIGNMENT: usize = 1 << 6;
+
+// e.g.: sifive
+// - https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/riscv/sifive-l2-cache.txt#L41
+// In general they are all the same.
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "riscv")]
+pub const ALIGNMENT: usize = 1 << 6;
+
+// This size is the same across all hardware for this architecture.
+// - https://docs.huihoo.com/doxygen/linux/kernel/3.7/arch_2s390_2include_2asm_2cache_8h.html
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "s390x")]
+pub const ALIGNMENT: usize = 1 << 8;
+
+// This size is the same across all hardware for this architecture.
+// - https://docs.huihoo.com/doxygen/linux/kernel/3.7/arch_2sparc_2include_2asm_2cache_8h.html#a9400cc2ba37e33279bdbc510a6311fb4
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "sparc")]
+pub const ALIGNMENT: usize = 1 << 5;
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "sparc64")]
+pub const ALIGNMENT: usize = 1 << 6;
+
+// On ARM cache line sizes are fixed, for both v6 and v7.
+// Board-specific or platform-specific overrides may be added later.
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "thumbv6")]
+pub const ALIGNMENT: usize = 1 << 5;
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "thumbv7")]
+pub const ALIGNMENT: usize = 1 << 5;
+
+// The operating system's cache size determines this.
+// Currently there is no way to determine it without runtime inference.
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "wasm32")]
+pub const ALIGNMENT: usize = 1 << 6;
+
+// Same as v6 and v7.
+// Cortex-A, Cortex-M, Cortex-R, ARM v7, v7-M, Krait and Neoverse N all use this size.
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "arm")]
+pub const ALIGNMENT: usize = 1 << 5;
+
+// Combined from 4 sectors; Volta reports 128.
+// To avoid breaking chunk optimizations it is better to stay with the default size.
+// If you have smaller data with less padding, 32 can be forced instead.
+// - https://devtalk.nvidia.com/default/topic/803600/variable-cache-line-width-/
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "nvptx")]
+pub const ALIGNMENT: usize = 1 << 7;
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "nvptx64")]
+pub const ALIGNMENT: usize = 1 << 7;
+
+// This size is the same across all hardware for this architecture.
+/// Cache and allocation multiple alignment size
+#[cfg(target_arch = "aarch64")]
+pub const ALIGNMENT: usize = 1 << 6;
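As orientation for readers (not part of this patch), a minimal sketch of an internal unit test that exercises the property the allocator relies on; it assumes the `allocate_aligned`/`free_aligned` functions introduced in `src/alloc/mod.rs` further down:

```rust
#[test]
fn alignment_is_respected() {
    use crate::alloc::{allocate_aligned, free_aligned, ALIGNMENT};

    // `Layout::from_size_align` (used by the allocator) requires a power of two.
    assert!(ALIGNMENT.is_power_of_two());

    // Any non-empty allocation is expected to start on an ALIGNMENT boundary.
    let ptr = allocate_aligned::<u8>(1024);
    assert_eq!(ptr.as_ptr() as usize % ALIGNMENT, 0);
    unsafe { free_aligned(ptr, 1024) };
}
```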
diff --git a/src/alloc/mod.rs b/src/alloc/mod.rs
new file mode 100644
index 00000000000..36b9f721583
--- /dev/null
+++ b/src/alloc/mod.rs
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Defines memory-related functions, such as allocate/deallocate/reallocate memory
+//! regions, cache and allocation alignments.
+
+use std::mem::size_of;
+use std::ptr::NonNull;
+use std::{
+    alloc::{handle_alloc_error, Layout},
+    sync::atomic::AtomicIsize,
+};
+
+use crate::types::NativeType;
+
+mod alignment;
+
+pub use alignment::ALIGNMENT;
+
+// If this number is not zero after all objects have been dropped, there is a memory leak.
+static mut ALLOCATIONS: AtomicIsize = AtomicIsize::new(0);
+
+/// # Safety
+/// This pointer may only be used to check if memory is allocated.
+#[inline]
+pub unsafe fn dangling<T: NativeType>() -> NonNull<T> {
+    NonNull::new_unchecked(ALIGNMENT as *mut T)
+}
+
+/// Allocates a cache-aligned memory region able to hold `size` items of `T`, uninitialized.
+/// This is more performant than using [allocate_aligned_zeroed] when all bytes will have
+/// an unknown or non-zero value and is semantically similar to `malloc`.
+pub fn allocate_aligned<T: NativeType>(size: usize) -> NonNull<T> {
+    unsafe {
+        if size == 0 {
+            dangling()
+        } else {
+            let size = size * size_of::<T>();
+            ALLOCATIONS.fetch_add(size as isize, std::sync::atomic::Ordering::SeqCst);
+
+            let layout = Layout::from_size_align_unchecked(size, ALIGNMENT);
+            let raw_ptr = std::alloc::alloc(layout) as *mut T;
+            NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout))
+        }
+    }
+}
+
+/// Allocates a cache-aligned memory region able to hold `size` items of `T`, with all bytes set to `0`.
+/// This is more performant than using [allocate_aligned] and setting all bytes to zero
+/// and is semantically similar to `calloc`.
+pub fn allocate_aligned_zeroed<T: NativeType>(size: usize) -> NonNull<T> {
+    unsafe {
+        if size == 0 {
+            dangling()
+        } else {
+            let size = size * size_of::<T>();
+            ALLOCATIONS.fetch_add(size as isize, std::sync::atomic::Ordering::SeqCst);
+
+            let layout = Layout::from_size_align_unchecked(size, ALIGNMENT);
+            let raw_ptr = std::alloc::alloc_zeroed(layout) as *mut T;
+            NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout))
+        }
+    }
+}
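For reference, a minimal sketch (not part of the diff) of how the two allocation entry points pair with `free_aligned`, defined next; the type and item count are illustrative only:

```rust
#[test]
fn allocate_then_free() {
    // `allocate_aligned_zeroed` is the `calloc` analogue: memory comes back zeroed.
    let ptr = allocate_aligned_zeroed::<u64>(64);
    assert_eq!(unsafe { *ptr.as_ptr() }, 0);

    // `free_aligned` must be called with the same item count used at allocation time,
    // which also keeps the `ALLOCATIONS` leak counter balanced.
    unsafe { free_aligned(ptr, 64) };
}
```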
+
+/// Frees memory previously allocated by [`allocate_aligned_zeroed`] or [`allocate_aligned`].
+/// # Safety
+/// This function is sound iff:
+///
+/// * `ptr` was allocated by [`allocate_aligned_zeroed`] or [`allocate_aligned`]
+/// * `size` must be the same size that was used to allocate that block of memory.
+pub unsafe fn free_aligned<T: NativeType>(ptr: NonNull<T>, size: usize) {
+    if size != 0 {
+        let size = size * size_of::<T>();
+        ALLOCATIONS.fetch_sub(size as isize, std::sync::atomic::Ordering::SeqCst);
+        std::alloc::dealloc(
+            ptr.as_ptr() as *mut u8,
+            Layout::from_size_align_unchecked(size, ALIGNMENT),
+        );
+    }
+}
+
+/// Reallocates memory previously allocated by [`allocate_aligned_zeroed`] or [`allocate_aligned`].
+/// # Safety
+/// This function is sound iff `ptr` was previously allocated by `allocate_aligned` or `allocate_aligned_zeroed` for `old_size` items.
+pub unsafe fn reallocate<T: NativeType>(
+    ptr: NonNull<T>,
+    old_size: usize,
+    new_size: usize,
+) -> NonNull<T> {
+    if old_size == 0 {
+        return allocate_aligned(new_size);
+    }
+
+    if new_size == 0 {
+        free_aligned(ptr, old_size);
+        return dangling();
+    }
+    let old_size = old_size * size_of::<T>();
+    let new_size = new_size * size_of::<T>();
+
+    ALLOCATIONS.fetch_add(
+        new_size as isize - old_size as isize,
+        std::sync::atomic::Ordering::SeqCst,
+    );
+    let raw_ptr = std::alloc::realloc(
+        ptr.as_ptr() as *mut u8,
+        Layout::from_size_align_unchecked(old_size, ALIGNMENT),
+        new_size,
+    ) as *mut T;
+    NonNull::new(raw_ptr).unwrap_or_else(|| {
+        handle_alloc_error(Layout::from_size_align_unchecked(new_size, ALIGNMENT))
+    })
+}
diff --git a/src/buffer/bytes.rs b/src/buffer/bytes.rs
index 65a959a91ea..e057f0fd942 100644
--- a/src/buffer/bytes.rs
+++ b/src/buffer/bytes.rs
@@ -7,6 +7,8 @@ use std::{ptr::NonNull, sync::Arc};
 
 use crate::ffi;
 use crate::types::NativeType;
+#[cfg(feature = "cache_aligned")]
+use crate::vec::AlignedVec as Vec;
 
 /// Mode of deallocating memory regions
 pub enum Deallocation {
@@ -89,7 +91,10 @@ impl<T: NativeType> Drop for Bytes<T> {
     fn drop(&mut self) {
         match &self.deallocation {
             Deallocation::Native(capacity) => unsafe {
-                Vec::from_raw_parts(self.ptr.as_ptr(), self.len, *capacity);
+                #[cfg(feature = "cache_aligned")]
+                let _ = Vec::from_raw_parts(self.ptr, self.len, *capacity);
+                #[cfg(not(feature = "cache_aligned"))]
+                let _ = Vec::from_raw_parts(self.ptr.as_ptr(), self.len, *capacity);
             },
             // foreign interface knows how to deallocate itself.
             Deallocation::Foreign(_) => (),
diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs
index cd307434671..5632b12d90a 100644
--- a/src/buffer/immutable.rs
+++ b/src/buffer/immutable.rs
@@ -52,6 +52,16 @@ impl<T: NativeType> Buffer<T> {
         MutableBuffer::from_len_zeroed(length).into()
     }
 
+    /// Takes ownership of [`Vec`].
+    /// # Implementation
+    /// This function is `O(1)`
+    #[cfg(not(feature = "cache_aligned"))]
+    #[cfg_attr(docsrs, doc(cfg(not(feature = "cache_aligned"))))]
+    #[inline]
+    pub fn from_vec(data: Vec<T>) -> Self {
+        MutableBuffer::from_vec(data).into()
+    }
+
     /// Auxiliary method to create a new Buffer
     pub(crate) fn from_bytes(bytes: Bytes<T>) -> Self {
         let length = bytes.len();
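A brief usage sketch (hypothetical values, not part of the patch) of the new `Buffer::from_vec` API added above; it is only available when the `cache_aligned` feature is disabled, since only then is the buffer backed by a plain `Vec`:

```rust
use arrow2::buffer::Buffer;

fn main() {
    // O(1): the Vec's allocation is taken over as-is, no copy is made.
    let buffer = Buffer::<u32>::from_vec(vec![1, 2, 3]);
    assert_eq!(buffer.len(), 3);
    assert_eq!(buffer.as_slice(), &[1, 2, 3]);
}
```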
diff --git a/src/buffer/mutable.rs b/src/buffer/mutable.rs
index c19961f40cb..f4ac85e1996 100644
--- a/src/buffer/mutable.rs
+++ b/src/buffer/mutable.rs
@@ -6,13 +6,15 @@ use crate::trusted_len::TrustedLen;
 use crate::types::{BitChunk, NativeType};
 
 use super::bytes::{Bytes, Deallocation};
+#[cfg(feature = "cache_aligned")]
+use crate::vec::AlignedVec as Vec;
 
 use super::immutable::Buffer;
 
 /// A [`MutableBuffer`] is this crates' interface to store types that are byte-like such as `i32`.
-/// It behaves like a [`Vec`], with the following differences:
-/// * memory is allocated along cache lines and in multiple of 64 bytes.
-/// * it can only hold types supported by the arrow format (`u8-u64`, `i8-i128`, `f32,f64` and [`crate::types::days_ms`])
+/// It behaves like a [`Vec`] but can only hold types supported by the arrow format
+/// (`u8-u64`, `i8-i128`, `f32,f64`, [`crate::types::days_ms`] and [`crate::types::months_days_ns`]).
+/// When the feature `cache_aligned` is active, memory is allocated along cache lines and in multiples of 64 bytes.
 /// A [`MutableBuffer`] can be converted to a [`Buffer`] via `.into`.
 /// # Example
 /// ```
@@ -28,6 +30,14 @@ pub struct MutableBuffer<T: NativeType> {
     data: Vec<T>,
 }
 
+#[cfg(not(feature = "cache_aligned"))]
+#[cfg_attr(docsrs, doc(cfg(not(feature = "cache_aligned"))))]
+impl<T: NativeType> From<MutableBuffer<T>> for Vec<T> {
+    fn from(data: MutableBuffer<T>) -> Self {
+        data.data
+    }
+}
+
 impl<T: NativeType> std::fmt::Debug for MutableBuffer<T> {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         std::fmt::Debug::fmt(&**self, f)
@@ -55,6 +65,14 @@ impl<T: NativeType> MutableBuffer<T> {
         }
     }
 
+    /// Takes ownership of [`Vec`].
+    #[cfg(not(feature = "cache_aligned"))]
+    #[cfg_attr(docsrs, doc(cfg(not(feature = "cache_aligned"))))]
+    #[inline]
+    pub fn from_vec(data: Vec<T>) -> Self {
+        Self { data }
+    }
+
     /// Allocates a new [MutableBuffer] with `len` and capacity to be at least `len`
     /// where data is zeroed.
     /// # Example
@@ -68,9 +86,11 @@ impl<T: NativeType> MutableBuffer<T> {
     /// ```
     #[inline]
     pub fn from_len_zeroed(len: usize) -> Self {
-        Self {
-            data: vec![T::default(); len],
-        }
+        #[cfg(not(feature = "cache_aligned"))]
+        let data = vec![T::default(); len];
+        #[cfg(feature = "cache_aligned")]
+        let data = Vec::from_len_zeroed(len);
+        Self { data }
     }
 
     /// Ensures that this buffer has at least `self.len + additional` bytes. This re-allocates iff
diff --git a/src/lib.rs b/src/lib.rs
index 7a2543d6028..a6ca6598496 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,6 +4,8 @@
 
 #[macro_use]
 pub mod array;
+#[cfg(feature = "cache_aligned")]
+mod alloc;
 pub mod bitmap;
 pub mod buffer;
 mod endianess;
@@ -11,6 +13,8 @@ pub mod error;
 pub mod scalar;
 pub mod trusted_len;
 pub mod types;
+#[cfg(feature = "cache_aligned")]
+mod vec;
 
 #[cfg(feature = "compute")]
 #[cfg_attr(docsrs, doc(cfg(feature = "compute")))]
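The mechanism behind the new feature is a cfg-gated import: when `cache_aligned` is on, the crate-internal `AlignedVec` (introduced in `src/vec.rs` below) is aliased to `Vec`, so the rest of the code keeps making the same `Vec`-like calls. A condensed sketch of that idiom as used inside the crate (crate-internal paths, not public API):

```rust
// Inside arrow2, e.g. at the top of src/buffer/mutable.rs:
#[cfg(feature = "cache_aligned")]
use crate::vec::AlignedVec as Vec;

fn example() {
    // With the feature on this is an AlignedVec<i32>; with it off, a std Vec<i32>.
    let mut data: Vec<i32> = Vec::with_capacity(8);
    data.push(1);
    assert_eq!(data.len(), 1);
}
```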
diff --git a/src/vec.rs b/src/vec.rs
new file mode 100644
index 00000000000..32b24856c0e
--- /dev/null
+++ b/src/vec.rs
@@ -0,0 +1,314 @@
+use std::iter::FromIterator;
+use std::ptr::NonNull;
+
+use crate::alloc;
+use crate::types::NativeType;
+
+/// Returns the nearest number that is greater than or equal to `num` and is a multiple of 64.
+#[inline]
+fn round_upto_multiple_of_64(num: usize) -> usize {
+    round_upto_power_of_2(num, 64)
+}
+
+/// Returns the nearest multiple of `factor` that is greater than or equal to `num`.
+/// `factor` must be a power of 2.
+fn round_upto_power_of_2(num: usize, factor: usize) -> usize {
+    debug_assert!(factor > 0 && (factor & (factor - 1)) == 0);
+    (num + (factor - 1)) & !(factor - 1)
+}
+
+#[inline]
+fn capacity_multiple_of_64<T: NativeType>(capacity: usize) -> usize {
+    round_upto_multiple_of_64(capacity * std::mem::size_of::<T>()) / std::mem::size_of::<T>()
+}
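A few worked values for the rounding helpers above (a hypothetical test, not in the patch), showing how capacities are padded to whole 64-byte blocks:

```rust
#[test]
fn rounding_examples() {
    assert_eq!(round_upto_multiple_of_64(0), 0);
    assert_eq!(round_upto_multiple_of_64(1), 64);
    assert_eq!(round_upto_multiple_of_64(64), 64);
    assert_eq!(round_upto_multiple_of_64(65), 128);
    // 10 u32s occupy 40 bytes; rounded up to 64 bytes that is 16 u32s of capacity.
    assert_eq!(capacity_multiple_of_64::<u32>(10), 16);
}
```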
+
+pub struct AlignedVec<T: NativeType> {
+    // dangling iff capacity = 0
+    ptr: NonNull<T>,
+    // invariant: len <= capacity
+    len: usize,
+    capacity: usize,
+}
+
+impl<T: NativeType> AlignedVec<T> {
+    #[inline]
+    pub fn new() -> Self {
+        let ptr = alloc::allocate_aligned(0);
+        Self {
+            ptr,
+            len: 0,
+            capacity: 0,
+        }
+    }
+
+    #[inline]
+    pub fn clear(&mut self) {
+        self.len = 0
+    }
+
+    #[inline]
+    pub fn capacity(&self) -> usize {
+        self.capacity
+    }
+
+    #[inline]
+    pub fn truncate(&mut self, len: usize) {
+        if len < self.len {
+            self.len = len;
+        }
+    }
+
+    /// Sets the length of this buffer.
+    /// # Safety
+    /// The caller must uphold the following invariants:
+    /// * every item in `[0, len)` has been initialized (growing `len` over
+    ///   uninitialized items makes them readable through the slice APIs)
+    /// * `len <= self.capacity()`
+    #[inline]
+    pub unsafe fn set_len(&mut self, len: usize) {
+        debug_assert!(len <= self.capacity());
+        self.len = len;
+    }
+
+    /// Returns the data stored in this buffer as a slice.
+    #[inline]
+    pub fn as_slice(&self) -> &[T] {
+        self
+    }
+
+    /// Returns the data stored in this buffer as a mutable slice.
+    #[inline]
+    pub fn as_mut_slice(&mut self) -> &mut [T] {
+        self
+    }
+
+    /// Returns a raw pointer to this buffer's internal memory.
+    /// This pointer is guaranteed to be aligned along cache-lines.
+    #[inline]
+    pub fn as_ptr(&self) -> *const T {
+        self.ptr.as_ptr()
+    }
+
+    /// Returns a mutable raw pointer to this buffer's internal memory.
+    /// This pointer is guaranteed to be aligned along cache-lines.
+    #[inline]
+    pub fn as_mut_ptr(&mut self) -> *mut T {
+        self.ptr.as_ptr()
+    }
+
+    #[inline]
+    pub fn with_capacity(capacity: usize) -> Self {
+        let capacity = capacity_multiple_of_64::<T>(capacity);
+        let ptr = alloc::allocate_aligned(capacity);
+        Self {
+            ptr,
+            len: 0,
+            capacity,
+        }
+    }
+
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    #[inline(always)]
+    pub fn reserve(&mut self, additional: usize) {
+        let required_cap = self.len + additional;
+        if required_cap > self.capacity {
+            // JUSTIFICATION
+            //  Benefit
+            //      necessity
+            //  Soundness
+            //      `self.ptr` is valid for `self.capacity` items.
+            self.ptr = unsafe { alloc::reallocate(self.ptr, self.capacity, required_cap) };
+            self.capacity = required_cap;
+        }
+    }
+
+    #[inline(always)]
+    pub fn resize(&mut self, new_len: usize, value: T) {
+        if new_len > self.len {
+            if self.capacity == 0 && value == T::default() {
+                // edge case where nothing was allocated yet: allocate zeroed memory directly
+                let required_cap = capacity_multiple_of_64::<T>(new_len);
+                let ptr = alloc::allocate_aligned_zeroed(required_cap);
+                self.ptr = ptr;
+                self.capacity = required_cap;
+                self.len = new_len;
+                return;
+            }
+
+            let diff = new_len - self.len;
+            self.reserve(diff);
+            unsafe {
+                // write the value
+                let mut ptr = self.ptr.as_ptr().add(self.len);
+                (0..diff).for_each(|_| {
+                    std::ptr::write(ptr, value);
+                    ptr = ptr.add(1);
+                })
+            }
+        }
+        // this truncates the buffer when new_len < self.len
+        self.len = new_len;
+    }
+
+    #[inline]
+    pub fn extend_from_slice(&mut self, items: &[T]) {
+        let additional = items.len();
+        self.reserve(additional);
+        unsafe {
+            let dst = self.ptr.as_ptr().add(self.len);
+            let src = items.as_ptr();
+            std::ptr::copy_nonoverlapping(src, dst, additional)
+        }
+        self.len += additional;
+    }
+
+    #[inline]
+    pub fn push(&mut self, item: T) {
+        self.reserve(1);
+        unsafe {
+            let dst = self.ptr.as_ptr().add(self.len) as *mut T;
+            std::ptr::write(dst, item);
+        }
+        self.len += 1;
+    }
+
+    pub fn shrink_to_fit(&mut self) {
+        let new_capacity = capacity_multiple_of_64::<T>(self.len);
+        if new_capacity < self.capacity {
+            // JUSTIFICATION
+            //  Benefit
+            //      necessity
+            //  Soundness
+            //      `self.ptr` is valid for `self.capacity` items.
+            let ptr = unsafe { alloc::reallocate(self.ptr, self.capacity, new_capacity) };
+
+            self.ptr = ptr;
+            self.capacity = new_capacity;
+        }
+    }
+
+    #[inline]
+    pub fn from_len_zeroed(len: usize) -> Self {
+        let new_capacity = capacity_multiple_of_64::<T>(len);
+        let ptr = alloc::allocate_aligned_zeroed(new_capacity);
+        Self {
+            ptr,
+            len,
+            capacity: new_capacity,
+        }
+    }
+
+    #[inline]
+    pub unsafe fn from_raw_parts(ptr: NonNull<T>, length: usize, capacity: usize) -> Self {
+        Self {
+            ptr,
+            capacity,
+            len: length,
+        }
+    }
+}
+
+impl<T: NativeType> Default for AlignedVec<T> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<T: NativeType> std::ops::Deref for AlignedVec<T> {
+    type Target = [T];
+
+    #[inline]
+    fn deref(&self) -> &[T] {
+        unsafe { std::slice::from_raw_parts(self.as_ptr(), self.len) }
+    }
+}
+
+impl<T: NativeType> std::ops::DerefMut for AlignedVec<T> {
+    #[inline]
+    fn deref_mut(&mut self) -> &mut [T] {
+        unsafe { std::slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) }
+    }
+}
+
+impl<A: NativeType> Extend<A> for AlignedVec<A> {
+    fn extend<T: IntoIterator<Item = A>>(&mut self, iter: T) {
+        let mut iterator = iter.into_iter();
+        let (lower, _) = iterator.size_hint();
+        let additional = lower;
+        self.reserve(additional);
+
+        // this is necessary because of https://github.com/rust-lang/rust/issues/32155
+        let mut len = SetLenOnDrop::new(&mut self.len);
+        let mut dst = unsafe { self.ptr.as_ptr().add(len.local_len) as *mut A };
+        let capacity = self.capacity;
+
+        while len.local_len < capacity {
+            if let Some(item) = iterator.next() {
+                unsafe {
+                    std::ptr::write(dst, item);
+                    dst = dst.add(1);
+                }
+                len.local_len += 1;
+            } else {
+                break;
+            }
+        }
+        drop(len);
+
+        iterator.for_each(|item| self.push(item));
+    }
+}
+
+struct SetLenOnDrop<'a> {
+    len: &'a mut usize,
+    local_len: usize,
+}
+
+impl<'a> SetLenOnDrop<'a> {
+    #[inline]
+    fn new(len: &'a mut usize) -> Self {
+        SetLenOnDrop {
+            local_len: *len,
+            len,
+        }
+    }
+}
+
+impl Drop for SetLenOnDrop<'_> {
+    #[inline]
+    fn drop(&mut self) {
+        *self.len = self.local_len;
+    }
+}
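To illustrate the two phases of the `Extend` implementation above (write into already-reserved capacity, then fall back to `push`), a hypothetical test with an iterator whose `size_hint` lower bound is zero:

```rust
#[test]
fn extend_with_imprecise_size_hint() {
    let mut vec = AlignedVec::<u32>::new();
    // `filter` reports a lower bound of 0, so nothing is reserved up front and
    // every item goes through the `push` fallback.
    vec.extend((0..100u32).filter(|x| x % 2 == 0));
    assert_eq!(vec.len(), 50);
    assert_eq!(vec.as_slice().last(), Some(&98));
}
```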
+
+impl<T: NativeType> FromIterator<T> for AlignedVec<T> {
+    fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Self {
+        let mut iterator = iter.into_iter();
+
+        // first iteration, which will likely reserve sufficient space for the buffer.
+        let mut buffer = match iterator.next() {
+            None => AlignedVec::new(),
+            Some(element) => {
+                let (lower, _) = iterator.size_hint();
+                let mut buffer = AlignedVec::with_capacity(lower.saturating_add(1));
+                unsafe {
+                    std::ptr::write(buffer.as_mut_ptr(), element);
+                    buffer.len = 1;
+                }
+                buffer
+            }
+        };
+
+        buffer.extend(iterator);
+        buffer
+    }
+}
diff --git a/tests/it/buffer/immutable.rs b/tests/it/buffer/immutable.rs
index db969849053..5fbf711ec5f 100644
--- a/tests/it/buffer/immutable.rs
+++ b/tests/it/buffer/immutable.rs
@@ -67,3 +67,11 @@ fn debug() {
     let a = format!("{:?}", buffer);
     assert_eq!(a, "[1, 2]")
 }
+
+#[cfg(not(feature = "cache_aligned"))]
+#[test]
+fn from_vec() {
+    let buffer = Buffer::<i32>::from_vec(vec![0, 1, 2]);
+    assert_eq!(buffer.len(), 3);
+    assert_eq!(buffer.as_slice(), &[0, 1, 2]);
+}
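Taken together, downstream code is unaffected by the new feature; only the container behind `MutableBuffer`/`Buffer` changes. An end-to-end sketch under that assumption (it relies on `MutableBuffer`'s usual `Vec`-like API such as `new` and `extend_from_slice`; the version string is illustrative):

```rust
// Cargo.toml of a consumer opting into the cache-aligned allocator:
// arrow2 = { version = "*", features = ["cache_aligned"] }

use arrow2::buffer::{Buffer, MutableBuffer};

fn main() {
    let mut values = MutableBuffer::<u64>::new();
    values.extend_from_slice(&[42, 0, 0, 0]);
    // Freezing into an immutable Buffer works the same with or without the feature.
    let buffer: Buffer<u64> = values.into();
    assert_eq!(buffer.as_slice(), &[42, 0, 0, 0]);
}
```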