From db84c25b43de5925384677d820af96d44ef9323c Mon Sep 17 00:00:00 2001
From: Igor Aleksanov <popzxc@yandex.ru>
Date: Mon, 5 Aug 2024 16:37:54 +0400
Subject: [PATCH 1/2] feat: Make boojum compile with modern compiler

---
 rust-toolchain.toml                          | 2 +-
 src/cs/implementations/fast_serialization.rs | 4 ++--
 src/lib.rs                                   | 1 -
 3 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index a671fa6..bc5d1d6 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,2 +1,2 @@
 [toolchain]
-channel = "nightly-2024-05-07"
+channel = "nightly-2024-08-01"
diff --git a/src/cs/implementations/fast_serialization.rs b/src/cs/implementations/fast_serialization.rs
index ea7301c..7288270 100644
--- a/src/cs/implementations/fast_serialization.rs
+++ b/src/cs/implementations/fast_serialization.rs
@@ -272,7 +272,7 @@ where
 {
     fn write_into_buffer<W: Write>(&self, mut dst: W) -> Result<(), Box<dyn Error>> {
         // we avoid transmute here
-        let flattened_self = self[..].flatten();
+        let flattened_self = self[..].as_flattened();
 
         let len_as_base = flattened_self.len();
         let len_le_bytes = (len_as_base as u64).to_le_bytes();
@@ -346,7 +346,7 @@ where
 {
     fn write_into_buffer<W: Write>(&self, mut dst: W) -> Result<(), Box<dyn Error>> {
         // we avoid transmute here
-        let flattened_self = self[..].flatten();
+        let flattened_self = self[..].as_flattened();
 
         let len_as_base = flattened_self.len();
         let len_le_bytes = (len_as_base as u64).to_le_bytes();
diff --git a/src/lib.rs b/src/lib.rs
index 8eb9e9f..6978fa7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -53,7 +53,6 @@
 #![feature(vec_push_within_capacity)]
 #![feature(return_position_impl_trait_in_trait)]
 #![feature(type_changing_struct_update)]
-#![feature(slice_flatten)]
 #![cfg_attr(feature = "include_packed_simd", feature(stdsimd))]
 
 pub mod algebraic_props;

From f88506c4dc048522364becff2651f7f7a6f77ecd Mon Sep 17 00:00:00 2001
From: Igor Aleksanov <popzxc@yandex.ru>
Date: Mon, 5 Aug 2024 17:14:27 +0400
Subject: [PATCH 2/2] Remove old optimizations that rely on packed_simd

---
 .github/workflows/ci.yaml                     |  19 -
 Cargo.toml                                    |   5 -
 src/field/goldilocks/arm_asm_packed_impl.rs   | 858 ------------------
 src/field/goldilocks/mod.rs                   |  15 -
 src/implementations/poseidon2/mod.rs          |  47 +-
 .../poseidon2/state_generic_impl.rs           |   4 -
 .../poseidon2/state_vectorized_double.rs      | 415 ---------
 src/lib.rs                                    |   2 +-
 8 files changed, 7 insertions(+), 1358 deletions(-)
 delete mode 100644 src/field/goldilocks/arm_asm_packed_impl.rs
 delete mode 100644 src/implementations/poseidon2/state_vectorized_double.rs

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index f5200a1..a984dd9 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -17,25 +17,6 @@ jobs:
       - uses: actions-rust-lang/setup-rust-toolchain@v1
       - run: cargo build --verbose
       - run: cargo test --verbose --all
-       
-  build_old:
-    name: cargo build and test (packed_simd)
-    strategy:
-      matrix:
-        # Needs big runners to run tests
-        # Only macos-13-xlarge is Apple Silicon, as per:
-        # https://docs.github.com/en/actions/using-github-hosted-runners/about-larger-runners/about-larger-runners#about-macos-larger-runners
-        os: [ubuntu-22.04-github-hosted-16core, macos-13-xlarge]
-    runs-on: ${{ matrix.os }}
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions-rust-lang/setup-rust-toolchain@v1
-        with:
-          toolchain: nightly-2023-05-31 
-
-      # Still compile the old rust nightly with packed simd - until we have a good replacement in poseidon.
-      - run: RUSTFLAGS=-Awarnings cargo +nightly-2023-05-31 build --features include_packed_simd
-      - run: RUSTFLAGS=-Awarnings cargo +nightly-2023-05-31 test --features include_packed_simd
 
   formatting:
     name: cargo fmt
diff --git a/Cargo.toml b/Cargo.toml
index a628cce..fbf9674 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,7 +30,6 @@ itertools = "0.10"
 blake2 = "0.10"
 sha2 = "0.10"
 num-modular = "0.5.1"
-packed_simd = { version = "0.3.9" , optional = true}
 pairing = { package = "pairing_ce", version = "=0.28.6" }
 crypto-bigint = "0.5"
 convert_case = "0.6"
@@ -61,9 +60,5 @@ opt-level = 3
 [features]
 # If enabled, logs will be using trace, if disabled, they will be printed to stdout.
 log_tracing = ["tracing"]
-# Currently packed_simd is no longer working with the newest nightly.
-# But we still keep it as a feature, as we didn't migrate all the code, and 
-# some people might want to use older rust nightly, to be able to gain some performance.
-include_packed_simd = ["packed_simd"]
 cr_paranoia_mode = []
 debug_track = []
diff --git a/src/field/goldilocks/arm_asm_packed_impl.rs b/src/field/goldilocks/arm_asm_packed_impl.rs
deleted file mode 100644
index 03399c4..0000000
--- a/src/field/goldilocks/arm_asm_packed_impl.rs
+++ /dev/null
@@ -1,858 +0,0 @@
-use crate::cs::implementations::utils::precompute_twiddles_for_fft;
-use crate::cs::traits::GoodAllocator;
-use crate::field::{Field, PrimeField};
-use crate::worker::Worker;
-use packed_simd::shuffle;
-use std::ops::{Add, BitOr, Sub};
-use std::usize;
-
-use super::GoldilocksField;
-
-// we need max of an alignment of u64x4 and u64x8 in this implementation, so 64
-
-#[derive(PartialEq, Eq, Hash, Clone, Copy)]
-#[repr(C, align(64))]
-pub struct MixedGL(pub [GoldilocksField; 16]);
-
-// we also need holder for SIMD targets, because u64x4 has smaller alignment than u64x8
-#[derive(Clone, Copy)]
-#[repr(C, align(64))]
-struct U64x4Holder([packed_simd::u64x4; 4]);
-
-impl std::fmt::Debug for MixedGL {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:?}", self.0)
-    }
-}
-
-impl std::fmt::Display for MixedGL {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:?}", self.0)
-    }
-}
-
-impl MixedGL {
-    pub const ORDER_BITS: usize = GoldilocksField::ORDER_BITS;
-    pub const ORDER: u64 = GoldilocksField::ORDER;
-    pub const TWO_ADICITY: usize = GoldilocksField::TWO_ADICITY;
-    pub const T: u64 = (Self::ORDER - 1) >> Self::TWO_ADICITY;
-    pub const BARRETT: u128 = 18446744078004518912; // 0x10000000100000000
-    pub const EPSILON: u64 = (1 << 32) - 1;
-    pub const EPSILON_VECTOR: packed_simd::u64x4 = packed_simd::u64x4::splat(Self::EPSILON);
-    pub const EPSILON_VECTOR_D: packed_simd::u64x8 = packed_simd::u64x8::splat(Self::EPSILON);
-
-    #[inline(always)]
-    pub fn new() -> Self {
-        Self([GoldilocksField::ZERO; 16])
-    }
-
-    #[inline(always)]
-    pub fn from_constant(value: GoldilocksField) -> Self {
-        Self([value; 16])
-    }
-
-    #[inline(always)]
-    pub fn from_array(value: [GoldilocksField; 16]) -> Self {
-        Self(value)
-    }
-
-    #[inline(always)]
-    #[unroll::unroll_for_loops]
-    pub fn to_reduced(&mut self) -> &mut Self {
-        let mut a_u64 = Self::as_u64x4_arrays(self);
-
-        for i in 0..4 {
-            let a = a_u64.0[i];
-            let a_reduced = a.add(Self::EPSILON_VECTOR);
-            let cmp = a_reduced.lt(Self::EPSILON_VECTOR);
-            let res = cmp.select(a_reduced, a);
-
-            a_u64.0[i] = res;
-        }
-
-        unsafe {
-            *self = Self::from_u64x4_arrays(a_u64);
-        }
-
-        self
-    }
-
-    #[inline(always)]
-    #[unroll::unroll_for_loops]
-    pub fn mul_constant_assign(&'_ mut self, other: &GoldilocksField) -> &mut Self {
-        for i in 0..16 {
-            self.0[i].mul_assign(other);
-        }
-
-        self
-    }
-
-    #[inline(always)]
-    #[unroll::unroll_for_loops]
-    fn mul_assign_impl(&mut self, other: &Self) -> &mut Self {
-        for i in 0..16 {
-            self.0[i].mul_assign(&other.0[i]);
-        }
-
-        self
-    }
-
-    #[inline(always)]
-    #[unroll::unroll_for_loops]
-    fn add_assign_impl(&mut self, other: &Self) -> &mut Self {
-        let mut a_u64 = Self::as_u64x4_arrays(self);
-        let b_u64 = Self::as_u64x4_arrays(other);
-
-        for i in 0..4 {
-            let a = a_u64.0[i];
-            let b = b_u64.0[i];
-            //additional reduction over b
-            let b_reduced = b.add(Self::EPSILON_VECTOR);
-            let cmp = b_reduced.lt(Self::EPSILON_VECTOR);
-            let b = cmp.select(b_reduced, b);
-            //a+b
-            let sum = a.add(b);
-            let sum_reduced = sum.add(Self::EPSILON_VECTOR);
-            let cmp0 = sum_reduced.lt(sum);
-            let cmp1 = sum.lt(a);
-            let reduce_flag = cmp0.bitor(cmp1);
-            let res = reduce_flag.select(sum_reduced, sum);
-
-            a_u64.0[i] = res;
-        }
-
-        unsafe {
-            *self = Self::from_u64x4_arrays(a_u64);
-        }
-
-        self
-    }
-
-    #[inline(always)]
-    #[unroll::unroll_for_loops]
-    fn sub_assign_impl(&'_ mut self, other: &Self) -> &mut Self {
-        let mut a_u64 = Self::as_u64x4_arrays(self);
-        let b_u64 = Self::as_u64x4_arrays(other);
-
-        for i in 0..4 {
-            let a = a_u64.0[i];
-            let b = b_u64.0[i];
-            //additional reduction over b
-            let b_reduced = b.add(Self::EPSILON_VECTOR);
-            let cmp = b_reduced.lt(Self::EPSILON_VECTOR);
-            let b = cmp.select(b_reduced, b);
-            //a-b
-            let diff = a.sub(b);
-            let diff_reduced = diff.sub(Self::EPSILON_VECTOR);
-            let cmp = a.lt(b);
-            let res = cmp.select(diff_reduced, diff);
-
-            a_u64.0[i] = res;
-        }
-
-        unsafe {
-            *self = Self::from_u64x4_arrays(a_u64);
-        }
-
-        self
-    }
-
-    pub unsafe fn butterfly_1x1_impl(&mut self) -> &mut Self {
-        let [part1, part2] = MixedGL::as_u64x8_arrays(&*self);
-        let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 2, 4, 6, 8, 10, 12, 14]);
-        let v: packed_simd::u64x8 = shuffle!(part1, part2, [1, 3, 5, 7, 9, 11, 13, 15]);
-        //additional reduction over v
-        let v_reduced = v.add(Self::EPSILON_VECTOR_D);
-        let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D);
-        let v = cmp.select(v_reduced, v);
-        // u + v
-        let sum = u.add(v);
-        let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
-        let cmp0 = sum_reduced.lt(sum);
-        let cmp1 = sum.lt(u);
-        let reduce_flag = cmp0.bitor(cmp1);
-        let res1 = reduce_flag.select(sum_reduced, sum);
-        // u - v
-        let diff = u.sub(v);
-        let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
-        let cmp = u.lt(v);
-        let res2 = cmp.select(diff_reduced, diff);
-
-        let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 8, 1, 9, 2, 10, 3, 11]);
-        let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 12, 5, 13, 6, 14, 7, 15]);
-
-        *self = MixedGL::from_u64x8_arrays([part1, part2]);
-
-        self
-    }
-
-    pub unsafe fn butterfly_2x2_impl(&mut self) -> &mut Self {
-        let [part1, part2] = MixedGL::as_u64x8_arrays(&*self);
-        let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 1, 4, 5, 8, 9, 12, 13]);
-        let v: packed_simd::u64x8 = shuffle!(part1, part2, [2, 3, 6, 7, 10, 11, 14, 15]);
-        //additional reduction over v
-        let v_reduced = v.add(Self::EPSILON_VECTOR_D);
-        let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D);
-        let v = cmp.select(v_reduced, v);
-        // u + v
-        let sum = u.add(v);
-        let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
-        let cmp0 = sum_reduced.lt(sum);
-        let cmp1 = sum.lt(u);
-        let reduce_flag = cmp0.bitor(cmp1);
-        let res1 = reduce_flag.select(sum_reduced, sum);
-        // u - v
-        let diff = u.sub(v);
-        let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
-        let cmp = u.lt(v);
-        let res2 = cmp.select(diff_reduced, diff);
-
-        let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 1, 8, 9, 2, 3, 10, 11]);
-        let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 5, 12, 13, 6, 7, 14, 15]);
-
-        *self = MixedGL::from_u64x8_arrays([part1, part2]);
-
-        self
-    }
-
-    pub unsafe fn butterfly_4x4_impl(&mut self) -> &mut Self {
-        let [part1, part2] = MixedGL::as_u64x8_arrays(&*self);
-        let u: packed_simd::u64x8 = shuffle!(part1, part2, [0, 1, 2, 3, 8, 9, 10, 11]);
-        let v: packed_simd::u64x8 = shuffle!(part1, part2, [4, 5, 6, 7, 12, 13, 14, 15]);
-        //additional reduction over v
-        let v_reduced = v.add(Self::EPSILON_VECTOR_D);
-        let cmp = v_reduced.lt(Self::EPSILON_VECTOR_D);
-        let v = cmp.select(v_reduced, v);
-        // u + v
-        let sum = u.add(v);
-        let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
-        let cmp0 = sum_reduced.lt(sum);
-        let cmp1 = sum.lt(u);
-        let reduce_flag = cmp0.bitor(cmp1);
-        let res1 = reduce_flag.select(sum_reduced, sum);
-        // u - v
-        let diff = u.sub(v);
-        let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
-        let cmp = u.lt(v);
-        let res2 = cmp.select(diff_reduced, diff);
-
-        let part1: packed_simd::u64x8 = shuffle!(res1, res2, [0, 1, 2, 3, 8, 9, 10, 11]);
-        let part2: packed_simd::u64x8 = shuffle!(res1, res2, [4, 5, 6, 7, 12, 13, 14, 15]);
-
-        *self = MixedGL::from_u64x8_arrays([part1, part2]);
-
-        self
-    }
-
-    /// # Safety
-    ///
-    /// Pointers must be properly aligned for `MixedGL` type, should point to arrays of length 8, and should point
-    /// to memory that can be mutated.
-    /// No references to the same memory should exist when this function is called.
-    /// Pointers should be different.
-    pub unsafe fn butterfly_8x8_impl(this: *const u64, other: *const u64) {
-        debug_assert!(this.addr() % std::mem::align_of::<MixedGL>() == 0);
-        debug_assert!(other.addr() % std::mem::align_of::<MixedGL>() == 0);
-
-        let u = std::slice::from_raw_parts_mut(this as *mut u64, 8);
-        let v = std::slice::from_raw_parts_mut(other as *mut u64, 8);
-        let a = packed_simd::u64x8::from_slice_aligned(u);
-        let b = packed_simd::u64x8::from_slice_aligned(v);
-        //additional reduction over b
-        let b_reduced = b.add(Self::EPSILON_VECTOR_D);
-        let cmp = b_reduced.lt(Self::EPSILON_VECTOR_D);
-        let b = cmp.select(b_reduced, b);
-        // u + v
-        let sum = a.add(b);
-        let sum_reduced = sum.add(Self::EPSILON_VECTOR_D);
-        let cmp0 = sum_reduced.lt(sum);
-        let cmp1 = sum.lt(a);
-        let reduce_flag = cmp0.bitor(cmp1);
-        let res1 = reduce_flag.select(sum_reduced, sum);
-        // u - v
-        let diff = a.sub(b);
-        let diff_reduced = diff.sub(Self::EPSILON_VECTOR_D);
-        let cmp = a.lt(b);
-        let res2 = cmp.select(diff_reduced, diff);
-
-        res1.write_to_slice_aligned(u);
-        res2.write_to_slice_aligned(v);
-    }
-
-    /// # Safety
-    ///
-    /// Pointers must be properly aligned for `MixedGL` type, should point to arrays of length 16, and should point
-    /// to memory that can be mutated.
-    /// No references to the same memory should exist when this function is called.
-    /// Pointers should be different.
-    pub unsafe fn butterfly_16x16_impl(mut this: *mut u64, mut other: *mut u64) {
-        debug_assert!(this.addr() % std::mem::align_of::<MixedGL>() == 0);
-        debug_assert!(other.addr() % std::mem::align_of::<MixedGL>() == 0);
-
-        Self::butterfly_8x8_impl(this, other);
-        this = this.offset(8);
-        other = other.offset(8);
-        Self::butterfly_8x8_impl(this, other);
-    }
-
-    // pub unsafe fn butterfly_16x16_impl(
-    //     this: &mut Self,
-    //     other: &mut Self,
-    // ) {
-    //     let mut this_ptr = this.0.as_ptr() as *mut u64;
-    //     let mut other_ptr = other.0.as_ptr() as *mut u64;
-
-    //     debug_assert!(this_ptr.addr() % std::mem::align_of::<MixedGL>() == 0);
-    //     debug_assert!(other_ptr.addr() % std::mem::align_of::<MixedGL>() == 0);
-
-    //     Self::butterfly_8x8_impl(this_ptr, other_ptr);
-    //     this_ptr = this_ptr.offset(8);
-    //     other_ptr = other_ptr.offset(8);
-    //     Self::butterfly_8x8_impl(this_ptr, other_ptr);
-    // }
-
-    #[inline(always)]
-    pub fn from_field_array(input: [GoldilocksField; 16]) -> Self {
-        Self(input)
-    }
-
-    #[inline(always)]
-    fn as_u64x4_arrays(input: &Self) -> U64x4Holder {
-        // this preserves an alignment
-        unsafe { std::mem::transmute(*input) }
-    }
-
-    #[inline(always)]
-    pub(crate) fn as_u64x8_arrays(input: &Self) -> [packed_simd::u64x8; 2] {
-        // this preserves an alignment
-        unsafe { std::mem::transmute(*input) }
-    }
-
-    #[inline(always)]
-    unsafe fn from_u64x4_arrays(input: U64x4Holder) -> Self {
-        // this preserves an alignment
-        std::mem::transmute(input)
-    }
-
-    #[inline(always)]
-    pub(crate) unsafe fn from_u64x8_arrays(input: [packed_simd::u64x8; 2]) -> Self {
-        // this preserves an alignment
-        std::mem::transmute(input)
-    }
-
-    #[inline(always)]
-    pub fn vec_add_assign(a: &mut [Self], b: &[Self]) {
-        use crate::field::traits::field_like::PrimeFieldLike;
-        for (a, b) in a.iter_mut().zip(b.iter()) {
-            a.add_assign(b, &mut ());
-        }
-    }
-
-    #[inline(always)]
-    pub fn vec_mul_assign(a: &mut [Self], b: &[Self]) {
-        use crate::field::traits::field_like::PrimeFieldLike;
-        for (a, b) in a.iter_mut().zip(b.iter()) {
-            a.mul_assign(b, &mut ());
-        }
-    }
-}
-
-impl Default for MixedGL {
-    fn default() -> Self {
-        Self([GoldilocksField::ZERO; 16])
-    }
-}
-
-impl crate::field::traits::field_like::PrimeFieldLike for MixedGL {
-    type Base = GoldilocksField;
-    type Context = ();
-
-    #[inline(always)]
-    fn zero(_ctx: &mut Self::Context) -> Self {
-        Self([GoldilocksField::ZERO; 16])
-    }
-    #[inline(always)]
-    fn one(_ctx: &mut Self::Context) -> Self {
-        Self([GoldilocksField::ONE; 16])
-    }
-    #[inline(always)]
-    fn minus_one(_ctx: &mut Self::Context) -> Self {
-        Self([GoldilocksField::MINUS_ONE; 16])
-    }
-
-    #[inline(always)]
-    fn add_assign(&mut self, other: &Self, _ctx: &mut Self::Context) -> &mut Self {
-        Self::add_assign_impl(self, other)
-    }
-
-    #[inline(always)]
-    fn sub_assign(&'_ mut self, other: &Self, _ctx: &mut Self::Context) -> &mut Self {
-        Self::sub_assign_impl(self, other)
-    }
-
-    #[inline(always)]
-    #[unroll::unroll_for_loops]
-    fn mul_assign(&'_ mut self, other: &Self, _ctx: &mut Self::Context) -> &mut Self {
-        Self::mul_assign_impl(self, other)
-    }
-
-    #[inline(always)]
-    fn square(&'_ mut self, _ctx: &mut Self::Context) -> &'_ mut Self {
-        let t = *self;
-        self.mul_assign(&t, _ctx);
-
-        self
-    }
-
-    #[inline(always)]
-    #[unroll::unroll_for_loops]
-    fn negate(&'_ mut self, _ctx: &mut Self::Context) -> &'_ mut Self {
-        let mut a_u64 = Self::as_u64x4_arrays(self);
-
-        for i in 0..4 {
-            let a = a_u64.0[i];
-
-            let is_zero = a.eq(packed_simd::u64x4::splat(0));
-            let neg = packed_simd::u64x4::splat(Self::ORDER).sub(a);
-            let res = is_zero.select(a, neg);
-
-            a_u64.0[i] = res;
-        }
-
-        unsafe {
-            *self = Self::from_u64x4_arrays(a_u64);
-        }
-
-        self
-    }
-
-    #[inline(always)]
-    fn double(&'_ mut self, _ctx: &mut Self::Context) -> &'_ mut Self {
-        let t = *self;
-        self.add_assign(&t, _ctx);
-
-        self
-    }
-
-    #[inline(always)]
-    #[unroll::unroll_for_loops]
-    fn inverse(&self, _ctx: &mut Self::Context) -> Self {
-        let mut result = *self;
-        for i in 0..16 {
-            result.0[i] = PrimeField::inverse(&result.0[i]).expect("inverse must exist");
-        }
-
-        result
-    }
-
-    #[inline(always)]
-    fn constant(value: Self::Base, _ctx: &mut Self::Context) -> Self {
-        Self([value; 16])
-    }
-}
-
-impl crate::field::traits::field_like::PrimeFieldLikeVectorized for MixedGL {
-    type Twiddles<A: GoodAllocator> = Vec<GoldilocksField, A>;
-    type InverseTwiddles<A: GoodAllocator> = Vec<GoldilocksField, A>;
-    #[inline(always)]
-    fn is_zero(&self) -> bool {
-        self.0 == [GoldilocksField::ZERO; 16]
-    }
-
-    #[inline(always)]
-    fn equals(&self, other: &Self) -> bool {
-        self.eq(other)
-    }
-
-    #[inline(always)]
-    fn mul_all_by_base(&'_ mut self, other: &Self::Base, _ctx: &mut Self::Context) -> &'_ mut Self {
-        Self::mul_constant_assign(self, other)
-    }
-
-    #[inline(always)]
-    fn slice_from_base_slice(input: &[Self::Base]) -> &[Self] {
-        if input.len() < Self::SIZE_FACTOR {
-            panic!("too small input size to cast");
-        }
-        debug_assert!(input.len() % Self::SIZE_FACTOR == 0);
-        debug_assert!(input.as_ptr().addr() % std::mem::align_of::<Self>() == 0);
-        let result_len = input.len() / 16;
-        unsafe { std::slice::from_raw_parts(input.as_ptr() as *mut Self, result_len) }
-    }
-
-    #[inline(always)]
-    fn slice_into_base_slice(input: &[Self]) -> &[Self::Base] {
-        let result_len = input.len() * 16;
-        unsafe { std::slice::from_raw_parts(input.as_ptr() as *mut GoldilocksField, result_len) }
-    }
-
-    #[inline(always)]
-    fn slice_into_base_slice_mut(input: &mut [Self]) -> &mut [Self::Base] {
-        let result_len = input.len() * 16;
-        unsafe {
-            std::slice::from_raw_parts_mut(input.as_ptr() as *mut GoldilocksField, result_len)
-        }
-    }
-
-    #[inline(always)]
-    fn vec_from_base_vec<A: GoodAllocator>(input: Vec<Self::Base, A>) -> Vec<Self, A> {
-        if input.len() < Self::SIZE_FACTOR {
-            panic!("too small input size to cast");
-        }
-        let (ptr, len, capacity, allocator) = input.into_raw_parts_with_alloc();
-        debug_assert!(ptr.addr() % std::mem::align_of::<Self>() == 0);
-        debug_assert!(len % Self::SIZE_FACTOR == 0);
-        debug_assert!(capacity % Self::SIZE_FACTOR == 0);
-
-        unsafe {
-            Vec::from_raw_parts_in(
-                ptr as _,
-                len / Self::SIZE_FACTOR,
-                capacity / Self::SIZE_FACTOR,
-                allocator,
-            )
-        }
-    }
-
-    #[inline(always)]
-    fn vec_into_base_vec<A: GoodAllocator>(input: Vec<Self, A>) -> Vec<Self::Base, A> {
-        let (ptr, len, capacity, allocator) = input.into_raw_parts_with_alloc();
-
-        unsafe {
-            Vec::from_raw_parts_in(
-                ptr as _,
-                len * Self::SIZE_FACTOR,
-                capacity * Self::SIZE_FACTOR,
-                allocator,
-            )
-        }
-    }
-
-    #[inline(always)]
-    fn fft_natural_to_bitreversed<A: GoodAllocator>(
-        input: &mut [Self],
-        coset: Self::Base,
-        twiddles: &Self::Twiddles<A>,
-        _ctx: &mut Self::Context,
-    ) {
-        // let input = crate::utils::cast_check_alignment_ref_mut_unpack::<Self, GoldilocksField>(input);
-        // crate::fft::fft_natural_to_bitreversed_cache_friendly(input, coset, twiddles);
-
-        crate::fft::fft_natural_to_bitreversed_mixedgl(input, coset, twiddles);
-    }
-
-    #[inline(always)]
-    fn ifft_natural_to_natural<A: GoodAllocator>(
-        input: &mut [Self],
-        coset: Self::Base,
-        twiddles: &Self::InverseTwiddles<A>,
-        _ctx: &mut Self::Context,
-    ) {
-        // let input = crate::utils::cast_check_alignment_ref_mut_unpack::<Self, GoldilocksField>(input);
-        // crate::fft::ifft_natural_to_natural_cache_friendly(input, coset, twiddles);
-
-        crate::fft::ifft_natural_to_natural_mixedgl(input, coset, twiddles);
-    }
-
-    #[inline(always)]
-    fn precompute_forward_twiddles_for_fft<A: GoodAllocator>(
-        fft_size: usize,
-        worker: &Worker,
-        ctx: &mut Self::Context,
-    ) -> Self::Twiddles<A> {
-        precompute_twiddles_for_fft::<GoldilocksField, GoldilocksField, A, false>(
-            fft_size, worker, ctx,
-        )
-    }
-
-    #[inline(always)]
-    fn precompute_inverse_twiddles_for_fft<A: GoodAllocator>(
-        fft_size: usize,
-        worker: &Worker,
-        ctx: &mut Self::Context,
-    ) -> Self::Twiddles<A> {
-        precompute_twiddles_for_fft::<GoldilocksField, GoldilocksField, A, true>(
-            fft_size, worker, ctx,
-        )
-    }
-}
-
-#[cfg(test)]
-mod test {
-
-    use crate::field::goldilocks::MixedGL;
-    use crate::field::rand_from_rng;
-    use crate::field::traits::field_like::PrimeFieldLike;
-    use crate::field::traits::field_like::PrimeFieldLikeVectorized;
-    use crate::field::{goldilocks::GoldilocksField, Field};
-    use crate::utils::clone_respecting_allignment;
-
-    #[test]
-    fn test_mixedgl_negate() {
-        let mut ctx = ();
-        const POLY_SIZE: usize = 1 << 20;
-        let mut rng = rand::thread_rng();
-
-        // Generate random Vec<GoldilocksField>
-        let a: Vec<GoldilocksField> = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
-
-        let mut ag = a.clone();
-
-        for aa in ag.iter_mut() {
-            Field::negate(aa);
-        }
-
-        let mut av: Vec<MixedGL> =
-            MixedGL::vec_from_base_vec(clone_respecting_allignment::<GoldilocksField, MixedGL, _>(
-                &a,
-            ));
-
-        // Test over GLPS
-        for aa in av.iter_mut() {
-            aa.negate(&mut ctx);
-        }
-
-        assert_eq!(MixedGL::vec_into_base_vec(av), ag);
-    }
-
-    use rand::Rng;
-
-    #[test]
-    fn test_mixedgl_add_assign() {
-        let mut ctx = ();
-        const POLY_SIZE: usize = 1 << 24;
-        let mut rng = rand::thread_rng();
-        let _s = GoldilocksField(0x0000000001000000);
-
-        // Generate random Vec<GoldilocksField>
-        // let a: Vec<GoldilocksField> = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
-        // let b: Vec<GoldilocksField> = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
-        // let a: Vec<GoldilocksField> = (0..POLY_SIZE).map(|_| GoldilocksField(0x0000000000000001)).collect();
-        // let b: Vec<GoldilocksField> = (0..POLY_SIZE).map(|_| GoldilocksField(0x0000000001000000)).collect();
-        let b: Vec<GoldilocksField> = (0..POLY_SIZE)
-            .map(|_| GoldilocksField(rng.gen_range(GoldilocksField::ORDER..u64::MAX)))
-            .collect();
-        let a: Vec<GoldilocksField> = (0..POLY_SIZE)
-            .map(|_| GoldilocksField(rng.gen_range(GoldilocksField::ORDER..u64::MAX)))
-            .collect();
-        // let a: Vec<GoldilocksField> = (0..POLY_SIZE).map(|_| GoldilocksField(0xfffffffff67f1442)).collect();
-        // let b: Vec<GoldilocksField> = (0..POLY_SIZE).map(|_| GoldilocksField(0xffffffff9c1d065d)).collect();
-
-        // dbg!(&a);
-        // dbg!(&b);
-
-        let mut ag = a.clone();
-        let bg = b.clone();
-
-        for (aa, bb) in ag.iter_mut().zip(bg.iter()) {
-            Field::add_assign(aa, bb);
-        }
-
-        let mut av: Vec<MixedGL> =
-            MixedGL::vec_from_base_vec(clone_respecting_allignment::<GoldilocksField, MixedGL, _>(
-                &a,
-            ));
-        let bv: Vec<MixedGL> =
-            MixedGL::vec_from_base_vec(clone_respecting_allignment::<GoldilocksField, MixedGL, _>(
-                &b,
-            ));
-
-        // Test over GLPS
-        for (aa, bb) in av.iter_mut().zip(bv.iter()) {
-            aa.add_assign(bb, &mut ctx);
-        }
-
-        let avv = MixedGL::vec_into_base_vec(av);
-        // for i in 0..avv.len() {
-        //     assert_eq!(avv[i], ag[i], "error {}", i);
-        // }
-
-        // dbg!(&ag[0]);
-        // dbg!(&avv[0]);
-
-        assert_eq!(avv, ag);
-    }
-
-    #[test]
-    fn test_mixedgl_sub_assign() {
-        let mut ctx = ();
-        const POLY_SIZE: usize = 1 << 20;
-        let _rng = rand::thread_rng();
-
-        // Generate random Vec<GoldilocksField>
-        // let a: Vec<GoldilocksField> = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
-        // let b: Vec<GoldilocksField> = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
-        let a: Vec<GoldilocksField> = (0..POLY_SIZE)
-            .map(|_| GoldilocksField(0x0000000000000001))
-            .collect();
-        let b: Vec<GoldilocksField> = (0..POLY_SIZE)
-            .map(|_| GoldilocksField(0x0000000001000000))
-            .collect();
-
-        // Test over Goldilocks
-        let mut ag = a.clone();
-        let bg = b.clone();
-
-        for (aa, bb) in ag.iter_mut().zip(bg.iter()) {
-            Field::sub_assign(aa, bb);
-        }
-
-        let mut av: Vec<MixedGL> =
-            MixedGL::vec_from_base_vec(clone_respecting_allignment::<GoldilocksField, MixedGL, _>(
-                &a,
-            ));
-        let bv: Vec<MixedGL> =
-            MixedGL::vec_from_base_vec(clone_respecting_allignment::<GoldilocksField, MixedGL, _>(
-                &b,
-            ));
-
-        // Test over GLPS
-        for (aa, bb) in av.iter_mut().zip(bv.iter()) {
-            aa.sub_assign(bb, &mut ctx);
-        }
-
-        // dbg!(&ag);
-        // dbg!(&av);
-
-        assert_eq!(ag, MixedGL::vec_into_base_vec(av));
-    }
-
-    #[test]
-    fn test_mixedgl_mul_assign() {
-        let mut ctx = ();
-        const POLY_SIZE: usize = 1 << 20;
-        let mut rng = rand::thread_rng();
-
-        // Generate random Vec<GoldilocksField>
-        let a: Vec<GoldilocksField> = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
-        let b: Vec<GoldilocksField> = (0..POLY_SIZE).map(|_| rand_from_rng(&mut rng)).collect();
-
-        // Test over Goldilocks
-        let mut ag = a.clone();
-        let bg = b.clone();
-
-        for (aa, bb) in ag.iter_mut().zip(bg.iter()) {
-            Field::mul_assign(aa, bb);
-        }
-
-        let mut av: Vec<MixedGL> =
-            MixedGL::vec_from_base_vec(clone_respecting_allignment::<GoldilocksField, MixedGL, _>(
-                &a,
-            ));
-        let bv: Vec<MixedGL> =
-            MixedGL::vec_from_base_vec(clone_respecting_allignment::<GoldilocksField, MixedGL, _>(
-                &b,
-            ));
-
-        // Test over GLPS
-        for (aa, bb) in av.iter_mut().zip(bv.iter()) {
-            aa.mul_assign(bb, &mut ctx);
-        }
-
-        // dbg!(&ag);
-        // dbg!(&av);
-
-        assert_eq!(ag, MixedGL::vec_into_base_vec(av));
-    }
-
-    #[test]
-    fn test_mixedgl_butterfly16x16() {
-        // let mut ctx = ();
-
-        // let am: [u64;32] = [0x0001000000000000, 0x0000000000000001, 0x0001000000000000, 0x0000000000000001, 0x0000000000000000, 0xffffffff00000000, 0x0000000000000001, 0x0000ffffffffffff, 0x0000000000000000, 0x0001000000000000, 0xffffffff00000000, 0xffffffff00000000, 0xffffffff00000000, 0xfffeffff00000001, 0xfffeffff00000002, 0xfffeffff00000002,
-        //     0x0000000000000000, 0x0000000000000001, 0x0000000000000000, 0x0001000000000001, 0xfffeffff00000001, 0xffffffff00000000, 0x0001000000000000, 0xfffeffff00000002, 0x0000000000000000, 0xfffeffff00000001, 0xffffffff00000000, 0x0000000000000001, 0x0000ffffffffffff, 0x0000000000000000, 0x0000000000000001, 0x0001000000000000];
-
-        let am: [u64; 32] = [
-            0x0001000000000000,
-            0x0000000000000001,
-            0x0001000000000000,
-            0x0000000000000001,
-            0x0000000000000000,
-            0xffffffff00000000,
-            0x0000000000000001,
-            0x0000ffffffffffff,
-            0x0000000000000000,
-            0x0001000000000000,
-            0xffffffff00000000,
-            0xffffffff00000000,
-            0xffffffff00000000,
-            0xfffeffff00000001,
-            0xfffeffff00000002,
-            0xfffeffff00000002,
-            0x0000000000000000,
-            0xffffffff01000001,
-            0x0000000000000000,
-            0x0000010000ffff00,
-            0xfffffeff00000101,
-            0xfffffffeff000001,
-            0x000000ffffffff00,
-            0xfffffeff01000101,
-            0x0000000000000000,
-            0xfffffeff00000101,
-            0xfffffffeff000001,
-            0xffffffff01000001,
-            0x000000fffeffff00,
-            0x0000000000000000,
-            0xffffffff01000001,
-            0x000000ffffffff00,
-        ];
-
-        let a: Vec<GoldilocksField> = am.into_iter().map(GoldilocksField).collect();
-        // let b: Vec<GoldilocksField> = bm.into_iter().map(GoldilocksField).collect();
-        let _s = GoldilocksField(0x0000000001000000);
-
-        // Test over Goldilocks
-        let mut ag = a.clone();
-        // let mut bg = b.clone();
-        let distance_in_cache = 16;
-
-        let mut j = 0;
-        while j < 16 {
-            let mut u = ag[j];
-            let v = ag[j + distance_in_cache];
-            // Field::mul_assign(&mut v, &s);
-            Field::sub_assign(&mut u, &v);
-            ag[j + distance_in_cache] = u;
-            Field::add_assign(&mut ag[j], &v);
-
-            j += 1;
-        }
-
-        let av: Vec<MixedGL> =
-            MixedGL::vec_from_base_vec(clone_respecting_allignment::<GoldilocksField, MixedGL, _>(
-                &a,
-            ));
-        // let mut bv: Vec<MixedGL> = MixedGL::vec_from_base_vec(clone_respecting_allignment::<GoldilocksField, MixedGL, _>(&b));
-        // let mut av = av[0];
-        // let mut bv = bv[0];
-
-        // Test over MixedGL
-        // av[1].mul_constant_assign(&s);
-        unsafe {
-            MixedGL::butterfly_16x16_impl(
-                av[0].0.as_ptr() as *mut u64,
-                av[1].0.as_ptr() as *mut u64,
-            );
-        }
-        // let mut u = av[0];
-        // let mut v = av[1];
-        // unsafe { MixedGL::butterfly_16x16_impl(&mut u, &mut v); }
-        // av[0] = u;
-        // av[1] = v;
-
-        let ag =
-            MixedGL::vec_from_base_vec(clone_respecting_allignment::<GoldilocksField, MixedGL, _>(
-                &ag,
-            ));
-        // let bg = MixedGL::vec_from_base_vec(clone_respecting_allignment::<GoldilocksField, MixedGL, _>(&bg));
-
-        dbg!(&ag);
-        dbg!(&av);
-
-        // dbg!(&bg);
-        // dbg!(&bv);
-
-        assert_eq!(ag, av);
-        // assert_eq!(bg, bv);
-    }
-}
diff --git a/src/field/goldilocks/mod.rs b/src/field/goldilocks/mod.rs
index 82fa6be..26f5382 100644
--- a/src/field/goldilocks/mod.rs
+++ b/src/field/goldilocks/mod.rs
@@ -12,18 +12,11 @@ mod extension;
 mod inversion;
 
 #[cfg(all(
-    not(feature = "include_packed_simd"),
     any(target_feature = "neon", target_feature = "avx2"),
     not(all(target_feature = "avx512f", target_feature = "avx512vl"))
 ))]
 pub mod arm_asm_impl;
 
-#[cfg(all(
-    feature = "include_packed_simd",
-    any(target_feature = "neon", target_feature = "avx2"),
-    not(all(target_feature = "avx512f", target_feature = "avx512vl"))
-))]
-pub mod arm_asm_packed_impl;
 #[cfg(not(any(
     all(target_feature = "avx512f", target_feature = "avx512vl"),
     target_feature = "neon",
@@ -51,19 +44,11 @@ pub mod x86_64_asm_impl;
 pub mod avx512_impl;
 
 #[cfg(all(
-    not(feature = "include_packed_simd"),
     any(target_feature = "neon", target_feature = "avx2"),
     not(all(target_feature = "avx512f", target_feature = "avx512vl"))
 ))]
 pub use arm_asm_impl::*;
 
-#[cfg(all(
-    feature = "include_packed_simd",
-    any(target_feature = "neon", target_feature = "avx2"),
-    not(all(target_feature = "avx512f", target_feature = "avx512vl"))
-))]
-pub use arm_asm_packed_impl::*;
-
 #[cfg(not(any(
     all(target_feature = "avx512f", target_feature = "avx512vl"),
     target_feature = "neon",
diff --git a/src/implementations/poseidon2/mod.rs b/src/implementations/poseidon2/mod.rs
index 28605cb..b3df999 100644
--- a/src/implementations/poseidon2/mod.rs
+++ b/src/implementations/poseidon2/mod.rs
@@ -4,50 +4,15 @@ use crate::field::goldilocks::GoldilocksField;
 pub mod params;
 
 pub mod state_generic_impl;
-#[cfg(not(any(
-    all(
-        target_feature = "avx512bw",
-        target_feature = "avx512cd",
-        target_feature = "avx512dq",
-        target_feature = "avx512f",
-        target_feature = "avx512vl",
-    ),
-    all(
-        feature = "include_packed_simd",
-        any(target_feature = "neon", target_feature = "avx2")
-    )
+#[cfg(not(all(
+    target_feature = "avx512bw",
+    target_feature = "avx512cd",
+    target_feature = "avx512dq",
+    target_feature = "avx512f",
+    target_feature = "avx512vl",
 )))]
 pub use state_generic_impl::*;
 
-// Other poseidon implementations depend on packed_simd 128
-// which is no longer available in std::simd (and packed_simd is no longer
-// supported in the newest rust nightly).
-#[cfg(all(
-    feature = "include_packed_simd",
-    any(target_feature = "neon", target_feature = "avx2"),
-    not(any(
-        target_feature = "avx512bw",
-        target_feature = "avx512cd",
-        target_feature = "avx512dq",
-        target_feature = "avx512f",
-        target_feature = "avx512vl"
-    ))
-))]
-pub mod state_vectorized_double;
-
-#[cfg(all(
-    feature = "include_packed_simd",
-    any(target_feature = "neon", target_feature = "avx2"),
-    not(any(
-        target_feature = "avx512bw",
-        target_feature = "avx512cd",
-        target_feature = "avx512dq",
-        target_feature = "avx512f",
-        target_feature = "avx512vl"
-    ))
-))]
-pub use state_vectorized_double::*;
-
 #[cfg(all(
     target_feature = "avx512bw",
     target_feature = "avx512cd",
diff --git a/src/implementations/poseidon2/state_generic_impl.rs b/src/implementations/poseidon2/state_generic_impl.rs
index c9b74e8..1d067a9 100644
--- a/src/implementations/poseidon2/state_generic_impl.rs
+++ b/src/implementations/poseidon2/state_generic_impl.rs
@@ -29,10 +29,6 @@ impl State {
     pub const T: u64 = (Self::ORDER - 1) >> Self::TWO_ADICITY;
     pub const BARRETT: u128 = 18446744078004518912; // 0x10000000100000000
     pub const EPSILON: u64 = (1 << 32) - 1;
-    #[cfg(feature = "include_packed_simd")]
-    pub const EPSILON_VECTOR: packed_simd::u64x4 = packed_simd::u64x4::splat(Self::EPSILON);
-    #[cfg(feature = "include_packed_simd")]
-    pub const EPSILON_VECTOR_D: packed_simd::u64x8 = packed_simd::u64x8::splat(Self::EPSILON);
 
     pub const RATE: usize = poseidon_goldilocks_params::RATE;
     pub const CAPACITY: usize = poseidon_goldilocks_params::CAPACITY;
diff --git a/src/implementations/poseidon2/state_vectorized_double.rs b/src/implementations/poseidon2/state_vectorized_double.rs
deleted file mode 100644
index 94e8aae..0000000
--- a/src/implementations/poseidon2/state_vectorized_double.rs
+++ /dev/null
@@ -1,415 +0,0 @@
-//! A vectorized implementation of the poseidon2 state.
-use crate::field::Field;
-use std::ops::{Add, Mul, Shl};
-use std::usize;
-use unroll::unroll_for_loops;
-
-use crate::field::goldilocks::GoldilocksField;
-use crate::field::traits::representation::U64Representable;
-
-use super::poseidon_goldilocks_params;
-
-#[derive(Default, PartialEq, Eq, Hash, Clone, Copy)]
-#[repr(C, align(64))]
-pub struct State(pub [u128; 12]);
-
-// we also need holder for SIMD targets, because u64x4 has smaller alignment than u64x8
-#[derive(Clone, Copy)]
-#[repr(C, align(64))]
-struct U128x4Holder([packed_simd::u128x4; 3]);
-
-impl std::fmt::Debug for State {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:?}", self.0)
-    }
-}
-
-impl std::fmt::Display for State {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:?}", self.0)
-    }
-}
-
-impl State {
-    pub const ORDER_BITS: usize = GoldilocksField::ORDER_BITS;
-    pub const ORDER: u128 = GoldilocksField::ORDER as u128;
-    pub const TWO_ADICITY: usize = GoldilocksField::TWO_ADICITY;
-    pub const T: u128 = (Self::ORDER - 1) >> Self::TWO_ADICITY;
-    pub const BARRETT: u128 = 18446744078004518912; // 0x10000000100000000
-    pub const EPSILON: u128 = (1 << 32) - 1;
-    pub const EPSILON_VECTOR: packed_simd::u128x4 = packed_simd::u128x4::splat(Self::EPSILON);
-
-    pub const RATE: usize = poseidon_goldilocks_params::RATE;
-    pub const CAPACITY: usize = poseidon_goldilocks_params::CAPACITY;
-    pub const STATE_WIDTH: usize = poseidon_goldilocks_params::STATE_WIDTH;
-    pub const HALF_NUM_FULL_ROUNDS: usize = poseidon_goldilocks_params::HALF_NUM_FULL_ROUNDS;
-    pub const NUM_FULL_ROUNDS_TOTAL: usize = poseidon_goldilocks_params::NUM_FULL_ROUNDS_TOTAL;
-    pub const NUM_PARTIAL_ROUNDS: usize = poseidon_goldilocks_params::NUM_PARTIAL_ROUNDS;
-    pub const TOTAL_NUM_ROUNDS: usize = poseidon_goldilocks_params::TOTAL_NUM_ROUNDS;
-    pub const ALL_ROUND_CONSTANTS: [Self; Self::TOTAL_NUM_ROUNDS] = const {
-        let mut constants_array = [Self([0u128; Self::STATE_WIDTH]); Self::TOTAL_NUM_ROUNDS];
-        let mut i = 0;
-        while i < Self::TOTAL_NUM_ROUNDS {
-            let mut t = [0u128; 12];
-            let mut j = 0;
-            while j < 12 {
-                t[j] = poseidon_goldilocks_params::ALL_ROUND_CONSTANTS[i * Self::STATE_WIDTH + j]
-                    as u128;
-                j += 1;
-            }
-            constants_array[i] = Self(t);
-            i += 1;
-        }
-        constants_array
-    };
-
-    pub const ALL_INNER_ROUND_CONSTANTS: [u128; Self::TOTAL_NUM_ROUNDS] = const {
-        let mut constants_array = [0u128; Self::TOTAL_NUM_ROUNDS];
-        let mut i = 0;
-        while i < Self::TOTAL_NUM_ROUNDS {
-            constants_array[i] =
-                poseidon_goldilocks_params::ALL_ROUND_CONSTANTS[i * Self::STATE_WIDTH] as u128;
-            i += 1;
-        }
-        constants_array
-    };
-
-    pub const M_I_DIAGONAL_ELEMENTS_POWS: [packed_simd::u128x4; 3] = [
-        packed_simd::u128x4::new(4, 14, 11, 8),
-        packed_simd::u128x4::new(0, 5, 2, 9),
-        packed_simd::u128x4::new(13, 6, 3, 12),
-    ];
-
-    pub const M_I_DIAGONAL_ELEMENTS: [packed_simd::u128x4; 3] = [
-        packed_simd::u128x4::new(1 << 4, 1 << 14, 1 << 11, 1 << 8),
-        packed_simd::u128x4::new(1 << 0, 1 << 5, 1 << 2, 1 << 9),
-        packed_simd::u128x4::new(1 << 13, 1 << 6, 1 << 3, 1 << 12),
-    ];
-
-    #[inline(always)]
-    pub fn new() -> Self {
-        Self([0u128; 12])
-    }
-
-    #[inline(always)]
-    pub const fn from_u128_array(value: [u128; 12]) -> Self {
-        Self(value)
-    }
-
-    #[inline(always)]
-    #[unroll_for_loops]
-    pub fn to_reduced(&mut self) -> &mut Self {
-        let mut a_u64 = Self::as_u128x4_arrays(self);
-
-        for i in 0..3 {
-            let a = a_u64.0[i];
-            let a_reduced = a.add(Self::EPSILON_VECTOR);
-            let cmp = a_reduced.lt(Self::EPSILON_VECTOR);
-            let res = cmp.select(a_reduced, a);
-
-            a_u64.0[i] = res;
-        }
-
-        *self = Self::from_u128x4_arrays(a_u64);
-        self
-    }
-
-    #[inline(always)]
-    #[unroll_for_loops]
-    fn mul_assign_impl_without_prereduction(&mut self, other: &Self) -> &mut Self {
-        for i in 0..12 {
-            let c = self.0[i] * other.0[i];
-            self.0[i] = GoldilocksField::from_u128_with_reduction(c).as_u64() as u128;
-        }
-
-        self
-    }
-
-    #[inline(always)]
-    pub fn from_field_array(input: [GoldilocksField; 12]) -> Self {
-        let mut d = Self::new();
-        for i in 0..12 {
-            d.0[i] = input[i].as_u64() as u128;
-        }
-        d
-    }
-
-    #[inline(always)]
-    pub fn as_field_array(self) -> [GoldilocksField; 12] {
-        let mut d = [GoldilocksField::ZERO; 12];
-        for i in 0..12 {
-            d[i] = GoldilocksField::from_u128_with_reduction(self.0[i]);
-        }
-        d
-    }
-
-    #[inline(always)]
-    fn as_u128x4_arrays(input: &Self) -> U128x4Holder {
-        // this preserves an alignment
-        unsafe { std::mem::transmute(*input) }
-    }
-
-    #[inline(always)]
-    fn from_u128x4_arrays(input: U128x4Holder) -> Self {
-        // this preserves an alignment
-        unsafe { std::mem::transmute(input) }
-    }
-
-    //vectorized mds_mul
-    #[inline(always)]
-    #[unroll_for_loops]
-    pub fn suggested_mds_mul(&mut self) {
-        //do we need them permanently permuted?
-        let x0 = packed_simd::u128x4::new(self.0[0], self.0[4], self.0[8], 0u128);
-        let x1 = packed_simd::u128x4::new(self.0[1], self.0[5], self.0[9], 0u128);
-        let x2 = packed_simd::u128x4::new(self.0[2], self.0[6], self.0[10], 0u128);
-        let x3 = packed_simd::u128x4::new(self.0[3], self.0[7], self.0[11], 0u128);
-
-        let t0 = x0.add(x1);
-        let t1 = x2.add(x3);
-        let x1d = x1.shl(1);
-        let x3d = x3.shl(1);
-        let t2 = x1d.add(t1);
-        let t3 = x3d.add(t0);
-        let t0q = t0.shl(2);
-        let t1q = t1.shl(2);
-        let t4 = t1q.add(t3);
-        let t5 = t0q.add(t2);
-        let t6 = t3.add(t5);
-        let t7 = t2.add(t4);
-
-        let y0 = t6.add(t6.wrapping_sum());
-        let y1 = t5.add(t5.wrapping_sum());
-        let y2 = t7.add(t7.wrapping_sum());
-        let y3 = t4.add(t4.wrapping_sum());
-
-        let mut y = Self::new();
-        for i in 0..3 {
-            y.0[i * 4] = y0.extract(i);
-            y.0[i * 4 + 1] = y1.extract(i);
-            y.0[i * 4 + 2] = y2.extract(i);
-            y.0[i * 4 + 3] = y3.extract(i);
-        }
-
-        *self = y;
-    }
-
-    #[inline(always)]
-    #[unroll_for_loops]
-    pub fn apply_round_constants(&mut self, round: usize) {
-        let const_current = Self::ALL_ROUND_CONSTANTS[round];
-        let const_u64 = Self::as_u128x4_arrays(&const_current);
-        let mut state_u64 = Self::as_u128x4_arrays(self);
-        for i in 0..3 {
-            state_u64.0[i] = state_u64.0[i].add(const_u64.0[i]);
-        }
-        *self = Self::from_u128x4_arrays(state_u64);
-    }
-
-    #[inline(always)]
-    #[unroll_for_loops]
-    pub fn apply_non_linearity(&mut self) {
-        for i in 0..12 {
-            self.0[i] = GoldilocksField::from_u128_with_reduction(self.0[i]).as_u64() as u128;
-        }
-        let mut t = *self;
-        self.elementwise_square();
-        t.elementwise_mul_assign(&*self);
-        self.elementwise_square();
-        self.elementwise_mul_assign(&t);
-    }
-
-    #[inline(always)]
-    fn elementwise_mul_assign(&mut self, other: &Self) {
-        Self::mul_assign_impl_without_prereduction(self, other);
-    }
-
-    #[inline(always)]
-    fn elementwise_square(&mut self) {
-        let t = *self;
-        self.elementwise_mul_assign(&t);
-    }
-
-    #[inline(always)]
-    fn full_round(&mut self, round_counter: &mut usize) {
-        // add constants
-        self.apply_round_constants(*round_counter);
-        // apply non-linearity
-        self.apply_non_linearity();
-        // multiply by MDS
-        self.suggested_mds_mul();
-
-        *round_counter += 1;
-    }
-
-    #[inline(always)]
-    #[unroll_for_loops]
-    pub fn m_i_mul(&mut self) {
-        let mut state_u64 = Self::as_u128x4_arrays(self);
-        let mut rowwise_sum = 0u128;
-        for i in 0..3 {
-            rowwise_sum += state_u64.0[i].wrapping_sum();
-        }
-
-        for i in 0..3 {
-            state_u64.0[i] = state_u64.0[i].mul(Self::M_I_DIAGONAL_ELEMENTS[i]);
-            state_u64.0[i] = state_u64.0[i].add(rowwise_sum);
-        }
-
-        *self = Self::from_u128x4_arrays(state_u64);
-    }
-
-    #[inline(always)]
-    fn partial_round_poseidon2(&mut self, round_counter: &mut usize) {
-        // add constant
-        use std::ops::AddAssign;
-        self.0[0].add_assign(&Self::ALL_INNER_ROUND_CONSTANTS[*round_counter]);
-        // apply non-linearity to the single element
-        let mut s = GoldilocksField::from_u128_with_reduction(self.0[0]);
-        let mut t = s;
-        s.square();
-        t.mul_assign(&s);
-        s.square();
-        s.mul_assign(&t);
-        self.0[0] = s.as_u64() as u128;
-
-        // multiply by MDS
-        self.m_i_mul();
-
-        *round_counter += 1;
-    }
-
-    #[inline(always)]
-    #[unroll_for_loops]
-    pub fn poseidon2_permutation(&mut self) {
-        self.suggested_mds_mul();
-        let mut round_counter = 0;
-        for _i in 0..4 {
-            self.full_round(&mut round_counter);
-        }
-        for i in 0..22 {
-            self.partial_round_poseidon2(&mut round_counter);
-
-            if i % 3 == 1 {
-                for j in 0..12 {
-                    self.0[j] =
-                        GoldilocksField::from_u128_with_reduction(self.0[j]).as_u64() as u128;
-                }
-            }
-        }
-        for _i in 0..4 {
-            self.full_round(&mut round_counter);
-        }
-
-        for i in 0..12 {
-            self.0[i] = GoldilocksField::from_u128_with_reduction(self.0[i]).as_u64() as u128;
-        }
-    }
-}
-
-#[inline(always)]
-pub fn poseidon2_permutation(state: &mut [GoldilocksField; State::STATE_WIDTH]) {
-    let mut state_vec = State::from_field_array(*state);
-    state_vec.poseidon2_permutation();
-    *state = state_vec.as_field_array();
-}
-
-#[cfg(test)]
-mod test {
-
-    use crate::field::rand_from_rng;
-    use crate::field::{goldilocks::GoldilocksField, Field};
-    use crate::implementations::poseidon2::State;
-    use crate::implementations::poseidon_goldilocks_naive;
-    use crate::implementations::suggested_mds;
-
-    //test for apply_round_constants
-    #[test]
-    fn test_apply_round_constants() {
-        let mut rng = rand::thread_rng();
-        let mut state = [GoldilocksField::ONE; 12];
-
-        for i in 0..state.len() {
-            state[i] = rand_from_rng(&mut rng);
-        }
-        dbg!(state);
-
-        let mut state_ref = state;
-        poseidon_goldilocks_naive::apply_round_constants(&mut state_ref, 0);
-
-        let mut state_vec = State::from_field_array(state);
-        state_vec.apply_round_constants(0);
-
-        // dbg!(&state_vec);
-
-        assert_eq!(state_ref, state_vec.as_field_array());
-    }
-
-    //test for apply_non_linearity
-    #[test]
-    fn test_apply_non_linearity() {
-        let mut rng = rand::thread_rng();
-        let mut state = [GoldilocksField::ONE; 12];
-
-        for i in 0..state.len() {
-            state[i] = rand_from_rng(&mut rng);
-        }
-        dbg!(state);
-
-        let mut state_ref = state;
-        for i in 0..12 {
-            poseidon_goldilocks_naive::apply_non_linearity(&mut state_ref[i]);
-        }
-
-        let mut state_vec = State::from_field_array(state);
-        state_vec.apply_non_linearity();
-
-        // dbg!(&state_vec);
-
-        assert_eq!(state_ref, state_vec.as_field_array());
-    }
-
-    //test for suggested_mds_mul
-    #[test]
-    fn test_suggested_mds_mul() {
-        let mut rng = rand::thread_rng();
-        let mut state = [GoldilocksField::ONE; 12];
-
-        for i in 0..state.len() {
-            state[i] = rand_from_rng(&mut rng);
-        }
-        dbg!(state);
-
-        let mut state_ref = state;
-        suggested_mds::suggested_mds_mul(&mut state_ref);
-
-        let mut state_vec = State::from_field_array(state);
-        state_vec.suggested_mds_mul();
-
-        // dbg!(&state_vec);
-
-        assert_eq!(state_ref, state_vec.as_field_array());
-    }
-
-    //test for poseidon2_permutation
-    #[test]
-    fn test_poseidon2_permutation() {
-        let mut rng = rand::thread_rng();
-        let mut state = [GoldilocksField::ONE; 12];
-
-        for i in 0..state.len() {
-            state[i] = rand_from_rng(&mut rng);
-        }
-
-        let state = [GoldilocksField(GoldilocksField::ORDER - 1); 12];
-        dbg!(state);
-
-        let mut state_ref = State::from_field_array(state);
-        State::poseidon2_permutation(&mut state_ref);
-
-        let mut state_vec = State::from_field_array(state);
-        state_vec.poseidon2_permutation();
-
-        assert_eq!(state_ref, state_vec);
-    }
-}
diff --git a/src/lib.rs b/src/lib.rs
index 6978fa7..a8bdc38 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -17,6 +17,7 @@
     clippy::needless_pass_by_ref_mut, // Mutable references are often used indirectly (e.g. via unsafe code).
     clippy::int_plus_one, // Suggests less expressive code.
     clippy::bool_assert_comparison, // This crate prefers explicitness.
+    clippy::derived_hash_with_manual_eq,
 )]
 #![allow(dead_code)]
 #![allow(dropping_references)] // Required to explicitly show that mutable references are dropped.
@@ -53,7 +54,6 @@
 #![feature(vec_push_within_capacity)]
 #![feature(return_position_impl_trait_in_trait)]
 #![feature(type_changing_struct_update)]
-#![cfg_attr(feature = "include_packed_simd", feature(stdsimd))]
 
 pub mod algebraic_props;
 pub mod config;