Merge pull request #973 from andyleiserson/transpose

Bit matrix transposes for converting to/from vectorized shares
private-attribution · Mar 19, 2024 · f707f06 · f707f06
2 parents d4b18bb + a4a2aa2
commit f707f06
Show file tree

Hide file tree

Showing 11 changed files with 1,142 additions and 36 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -6,6 +6,10 @@ members = ["ipa-core", "ipa-macros"]
 incremental = true
 lto = "thin"
 
+[profile.release-max]
+inherits = "release"
+codegen-units = 1
+
 [profile.bench-dhat]
 inherits = "bench"
 incremental = true

diff --git a/ipa-core/Cargo.toml b/ipa-core/Cargo.toml
@@ -216,6 +216,11 @@ path = "benches/oneshot/ipa.rs"
 harness = false
 required-features = ["enable-benches", "descriptive-gate"]
 
+[[bench]]
+name = "transpose"
+harness = false
+required-features = ["enable-benches"]
+
 [[test]]
 name = "helper_networks"
 required-features = [

diff --git a/ipa-core/benches/transpose.rs b/ipa-core/benches/transpose.rs
@@ -0,0 +1,114 @@
+//! Benchmarks for bit matrix transpose operations.
+//!
+//! Some of these routines run very fast, which doesn't work well with the default Criterion settings.
+//! The warm-up time and measurement time are reduced, because the defaults would produce a very large
+//! number of samples (which in turn would take Criterion a long time to analyze).
+//!
+//! Some of the benchmark routines are looped so that the running time is long enough for Criterion
+//! to measure reliably. When too short, Criterion complains that some measurements take zero time.
+//! Presumably, the behavior of the underlying system clock is a contributing factor here.
+//!
+//! There is also a panic in the `plotters` crate used by Criterion to produce HTML reports that can
+//! occur with very fast-running routines. This can be worked around by passing the `-n` option to
+//! Criterion to disable HTML reports.
+
+use std::{array, iter::repeat_with, time::Duration};
+
+use criterion::{criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion, Throughput};
+use ipa_core::{
+    error::UnwrapInfallible,
+    ff::boolean_array::BA64,
+    secret_sharing::{
+        vector::{transpose_16x16, transpose_8x8},
+        SharedValue, TransposeFrom,
+    },
+};
+use rand::{
+    distributions::{Distribution, Standard},
+    thread_rng, Rng,
+};
+
+/// Returns an `N`-element array whose elements are each drawn from the thread-local
+/// random number generator using the `Standard` distribution.
+fn random_array<T, const N: usize>() -> [T; N]
+where
+    Standard: Distribution<T>,
+{
+    let mut source = thread_rng();
+    // Mapping over an array of units fills the slots in index order, one draw per slot.
+    [(); N].map(|()| source.gen())
+}
+
+/// Describes one benchmark case.
+struct Params {
+    /// Row count of the matrix; used for the group name and the throughput figure.
+    rows: usize,
+    /// Column count of the matrix; used for the group name and the throughput figure.
+    cols: usize,
+    /// Number of input matrices transposed inside a single measured iteration.
+    iters: usize,
+}
+
+/// Registers and runs a Criterion benchmark for a single transpose routine.
+///
+/// The benchmark group is named `"{rows}x{cols}"` and the benchmark itself
+/// `transpose/{iters}x`. Each measured iteration receives a freshly generated batch of `iters`
+/// random inputs (built outside the timed section by `iter_batched_ref`) and applies `routine`
+/// to every one of them. Throughput is reported as `rows * cols * iters` elements per iteration.
+///
+/// Warm-up and measurement time are both cut to 200 ms; see the module documentation for why
+/// the Criterion defaults are a poor fit for routines this fast.
+fn do_benchmark<O, T, const N: usize>(
+    c: &mut Criterion,
+    Params { rows, cols, iters }: Params,
+    routine: fn(&[T; N]) -> O,
+) where
+    Standard: Distribution<T>,
+{
+    let mut group = c.benchmark_group(format!("{rows}x{cols}"));
+    group.warm_up_time(Duration::from_millis(200));
+    group.measurement_time(Duration::from_millis(200));
+    group.throughput(Throughput::Elements((rows * cols * iters) as u64));
+
+    group.bench_with_input(
+        BenchmarkId::new("transpose", format!("{iters}x")),
+        &(),
+        move |b, _| {
+            b.iter_batched_ref(
+                || repeat_with(random_array).take(iters).collect::<Vec<_>>(),
+                |input| {
+                    input
+                        .iter()
+                        // Only the element count leaves this closure, so without a barrier
+                        // the optimizer is free to discard the transposes being measured.
+                        // Passing every result through `black_box` keeps the work observable.
+                        .map(|matrix| std::hint::black_box(routine(matrix)))
+                        .count()
+                },
+                BatchSize::SmallInput,
+            )
+        },
+    );
+    group.finish();
+}
+
+/// Benchmarks `transpose_8x8`, running it on 100 inputs per measured iteration.
+fn bench_8x8(c: &mut Criterion) {
+    let params = Params {
+        rows: 8,
+        cols: 8,
+        iters: 100,
+    };
+    do_benchmark(c, params, |matrix| transpose_8x8(matrix));
+}
+
+/// Benchmarks `transpose_16x16`, running it on 50 inputs per measured iteration.
+fn bench_16x16(c: &mut Criterion) {
+    let params = Params {
+        rows: 16,
+        cols: 16,
+        iters: 50,
+    };
+    do_benchmark(c, params, transpose_16x16);
+}
+
+/// Benchmarks the `TransposeFrom` implementation that fills an array of `BA64` values,
+/// performing one transpose per measured iteration.
+fn bench_64x64(c: &mut Criterion) {
+    let params = Params {
+        rows: 64,
+        cols: 64,
+        iters: 1,
+    };
+    do_benchmark(c, params, |src| {
+        // The destination starts zeroed and is returned, so building it is part of the
+        // measured work.
+        let mut dst = array::from_fn(|_| BA64::ZERO);
+        dst.transpose_from(src).unwrap_infallible();
+        dst
+    });
+}
+
+criterion_group!(benches_8x8, bench_8x8);
+criterion_group!(benches_16x16, bench_16x16);
+criterion_group!(benches_64x64, bench_64x64);
+criterion_main!(benches_8x8, benches_16x16, benches_64x64);
diff --git a/ipa-core/src/ff/boolean_array.rs b/ipa-core/src/ff/boolean_array.rs
@@ -269,6 +269,18 @@ macro_rules! boolean_array_impl {
             impl $name {
                 #[cfg(all(test, unit_test))]
                 const STORE_LEN: usize = bitvec::mem::elts::<u8>($bits);
+
+                #[inline]
+                #[must_use]
+                pub fn as_raw_slice(&self) -> &[u8] { // read-only byte view of this array
+                    self.0.as_raw_slice() // forwards to the wrapped field's `as_raw_slice`
+                }
+
+                #[inline]
+                #[must_use]
+                pub fn as_raw_mut_slice(&mut self) -> &mut [u8] { // writable byte view of this array
+                    self.0.as_raw_mut_slice() // forwards to the wrapped field's `as_raw_mut_slice`
+                }
             }
 
             impl ArrayAccess for $name {
@@ -736,12 +748,21 @@ boolean_array_impl_small!(boolean_array_5, BA5, 5, fallible);
 boolean_array_impl_small!(boolean_array_6, BA6, 6, fallible);
 boolean_array_impl_small!(boolean_array_7, BA7, 7, fallible);
 boolean_array_impl_small!(boolean_array_8, BA8, 8, infallible);
+boolean_array_impl_small!(boolean_array_16, BA16, 16, infallible);
 boolean_array_impl_small!(boolean_array_20, BA20, 20, fallible);
 boolean_array_impl_small!(boolean_array_32, BA32, 32, infallible);
 boolean_array_impl_small!(boolean_array_64, BA64, 64, infallible);
 boolean_array_impl_small!(boolean_array_112, BA112, 112, infallible);
 boolean_array_impl!(boolean_array_256, BA256, 256, infallible);
 
+impl Vectorizable<256> for BA64 { // opts `BA64` into vectorization with width 256
+    type Array = StdArray<BA64, 256>; // vector storage: a `StdArray` of 256 `BA64` values
+}
+
+impl Vectorizable<256> for BA256 { // opts `BA256` into vectorization with width 256
+    type Array = StdArray<BA256, 256>; // vector storage: a `StdArray` of 256 `BA256` values
+}
+
 // used to convert into Fp25519
 impl From<(u128, u128)> for BA256 {
     fn from(value: (u128, u128)) -> Self {

diff --git a/ipa-core/src/secret_sharing/decomposed.rs b/ipa-core/src/secret_sharing/decomposed.rs
@@ -1,4 +1,8 @@
-use std::{fmt::Debug, ops::Deref, slice};
+use std::{
+    fmt::Debug,
+    ops::{Deref, DerefMut},
+    slice,
+};
 
 use crate::{
     error::Error,
@@ -12,7 +16,7 @@ pub struct BitDecomposed<S> {
 }
 
 impl<S> BitDecomposed<S> {
-    const MAX: usize = 64;
+    const MAX: usize = 256;
 
     /// Create a new value from an iterator.
     /// # Panics
@@ -99,6 +103,12 @@ impl<S> BitDecomposed<S> {
     }
 }
 
+impl<S: Clone> BitDecomposed<S> {
+    /// Resizes the value in place so that it holds exactly `new_len` bits.
+    ///
+    /// This has the semantics of `Vec::resize`: trailing bits are dropped when shrinking,
+    /// and clones of `value` are appended when growing.
+    ///
+    /// # Panics
+    /// If `new_len` is greater than `Self::MAX`. Without this check, growing a value would
+    /// sidestep the length limit that the constructors guard.
+    pub fn resize(&mut self, new_len: usize, value: S) {
+        assert!(
+            new_len <= Self::MAX,
+            "cannot resize BitDecomposed to {new_len} bits; the maximum is {}",
+            Self::MAX,
+        );
+        self.bits.resize(new_len, value);
+    }
+}
+
 impl<S> TryFrom<Vec<S>> for BitDecomposed<S> {
     type Error = Error;
     fn try_from(bits: Vec<S>) -> Result<Self, Self::Error> {
@@ -148,6 +158,12 @@ impl<S> Deref for BitDecomposed<S> {
     }
 }
 
+impl<S> DerefMut for BitDecomposed<S> { // mutable counterpart of the `Deref` impl above
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.bits // exposes the backing `bits` storage for in-place modification
+    }
+}
+
 impl<S> IntoIterator for BitDecomposed<S> {
     type Item = S;
     type IntoIter = <Vec<S> as IntoIterator>::IntoIter;

diff --git a/ipa-core/src/secret_sharing/mod.rs b/ipa-core/src/secret_sharing/mod.rs
@@ -3,7 +3,10 @@ pub mod replicated;
 mod decomposed;
 mod into_shares;
 mod scheme;
+#[cfg(not(feature = "enable-benches"))]
 mod vector;
+#[cfg(feature = "enable-benches")]
+pub mod vector;
 
 use std::{
     fmt::Debug,
@@ -20,7 +23,8 @@ use rand::{
 };
 pub use scheme::{Bitwise, Linear, LinearRefOps, SecretSharing};
 pub use vector::{
-    FieldArray, FieldSimd, FieldVectorizable, SharedValueArray, StdArray, Vectorizable,
+    FieldArray, FieldSimd, FieldVectorizable, SharedValueArray, StdArray, TransposeFrom,
+    Vectorizable,
 };
 
 #[cfg(any(test, feature = "test-fixture", feature = "cli"))]

diff --git a/ipa-core/src/secret_sharing/replicated/semi_honest/additive_share.rs b/ipa-core/src/secret_sharing/replicated/semi_honest/additive_share.rs
@@ -102,6 +102,28 @@ impl<V: SharedValue + Vectorizable<N>, const N: usize> AdditiveShare<V, N> {
     pub fn right_arr(&self) -> &<V as Vectorizable<N>>::Array {
         &self.1
     }
+
+    pub(in crate::secret_sharing) fn left_arr_mut(&mut self) -> &mut <V as Vectorizable<N>>::Array {
+        &mut self.0 // mutable borrow of field 0 (the left array); only visible inside `crate::secret_sharing`
+    }
+
+    pub(in crate::secret_sharing) fn right_arr_mut(
+        &mut self,
+    ) -> &mut <V as Vectorizable<N>>::Array {
+        &mut self.1 // mutable borrow of field 1, the same field `right_arr` borrows immutably
+    }
+
+    /// Consumes the share and returns its two arrays as a `(left, right)` tuple.
+    pub fn into_arr_tuple(self) -> (<V as Vectorizable<N>>::Array, <V as Vectorizable<N>>::Array) {
+        (self.0, self.1)
+    }
+
+    /// Builds a share whose left array is produced by `Array::from_fn(lf)` and whose right
+    /// array is produced by `Array::from_fn(rf)`. The left array is constructed first.
+    pub fn from_fns<LF: FnMut(usize) -> V, RF: FnMut(usize) -> V>(lf: LF, rf: RF) -> Self {
+        let left = <V as Vectorizable<N>>::Array::from_fn(lf);
+        let right = <V as Vectorizable<N>>::Array::from_fn(rf);
+        Self(left, right)
+    }
 }
 
 impl<V: SharedValue> AdditiveShare<V>

diff --git a/ipa-core/src/secret_sharing/vector/array.rs b/ipa-core/src/secret_sharing/vector/array.rs
@@ -6,12 +6,12 @@ use std::{
 };
 
 use generic_array::{ArrayLength, GenericArray};
-use typenum::U32;
+use typenum::{U16, U256, U32, U64};
 
 use crate::{
     error::LengthError,
     ff::{Field, Fp32BitPrime, Serializable},
-    protocol::prss::{FromRandom, FromRandomU128},
+    protocol::prss::FromRandom,
     secret_sharing::{FieldArray, Sendable, SharedValue, SharedValueArray},
 };
 
@@ -277,14 +277,24 @@ impl<F: SharedValue + FromRandom> FromRandom for StdArray<F, 1> {
     }
 }
 
-impl FromRandom for StdArray<Fp32BitPrime, 32> {
-    type SourceLength = U32;
+macro_rules! impl_from_random { // generates `FromRandom` for `StdArray<$value_ty, $width>`
+    ($value_ty:ty, $width:expr, $source_len:ty, $item_len:expr) => { // $source_len: total u128 words taken; $item_len: words per element
+        impl FromRandom for StdArray<$value_ty, $width> {
+            type SourceLength = $source_len;
 
-    fn from_random(src: GenericArray<u128, U32>) -> Self {
-        Self(array::from_fn(|i| Fp32BitPrime::from_random_u128(src[i])))
-    }
+            fn from_random(src: GenericArray<u128, Self::SourceLength>) -> Self {
+                Self(array::from_fn(|i| { // element i is built from words [$item_len * i, $item_len * (i + 1)) of `src`
+                    <$value_ty>::from_random( // delegates to the element type's own `FromRandom` impl
+                        GenericArray::from_slice(&src[$item_len * i..$item_len * (i + 1)]).clone(),
+                    ) // NOTE(review): nothing here ties $source_len to $width * $item_len — confirm at each invocation
+                }))
+            }
+        }
+    };
 }
 
+impl_from_random!(Fp32BitPrime, 32, U32, 1);
+
 impl<V: SharedValue> Serializable for StdArray<V, 1> {
     type Size = <V as Serializable>::Size;
     type DeserializationError = <V as Serializable>::DeserializationError;
@@ -298,34 +308,45 @@ impl<V: SharedValue> Serializable for StdArray<V, 1> {
     }
 }
 
-impl<V: SharedValue> Serializable for StdArray<V, 32>
-where
-    V: SharedValue,
-    <V as Serializable>::Size: Mul<U32>,
-    <<V as Serializable>::Size as Mul<U32>>::Output: ArrayLength,
-{
-    type Size = <<V as Serializable>::Size as Mul<U32>>::Output;
-    type DeserializationError = <V as Serializable>::DeserializationError;
-
-    fn serialize(&self, buf: &mut GenericArray<u8, Self::Size>) {
-        let sz: usize = (<V as SharedValue>::BITS / 8).try_into().unwrap();
-        for i in 0..32 {
-            self.0[i].serialize(
-                GenericArray::try_from_mut_slice(&mut buf[sz * i..sz * (i + 1)]).unwrap(),
-            );
-        }
-    }
-
-    fn deserialize(buf: &GenericArray<u8, Self::Size>) -> Result<Self, Self::DeserializationError> {
-        let sz: usize = (<V as SharedValue>::BITS / 8).try_into().unwrap();
-        let mut res = [V::ZERO; 32];
-        for i in 0..32 {
-            res[i] = V::deserialize(GenericArray::from_slice(&buf[sz * i..sz * (i + 1)]))?;
+/// Implements `Serializable` for `StdArray<V, $width>`.
+///
+/// `$width` is the array length and `$width_ty` the matching `typenum` integer, which is
+/// needed to express the output size `V::Size * $width` at the type level. Elements are laid
+/// out back to back: element `i` occupies bytes `[sz * i, sz * (i + 1))` of the buffer, where
+/// `sz` is the serialized size of a single `V`.
+///
+/// Deserialization stops at, and returns, the first element error.
+macro_rules! impl_serializable {
+    ($width:expr, $width_ty:ty) => {
+        impl<V: SharedValue> Serializable for StdArray<V, $width>
+        where
+            V: SharedValue,
+            <V as Serializable>::Size: Mul<$width_ty>,
+            <<V as Serializable>::Size as Mul<$width_ty>>::Output: ArrayLength,
+        {
+            type Size = <<V as Serializable>::Size as Mul<$width_ty>>::Output;
+            type DeserializationError = <V as Serializable>::DeserializationError;
+
+            fn serialize(&self, buf: &mut GenericArray<u8, Self::Size>) {
+                // Take the per-element length from the element's declared serialized size.
+                // `BITS / 8` rounds down for bit widths that are not a multiple of 8, which
+                // would hand the element a slice of the wrong length and fail the conversion.
+                let sz = <<V as Serializable>::Size as typenum::Unsigned>::USIZE;
+                for i in 0..$width {
+                    self.0[i].serialize(
+                        GenericArray::try_from_mut_slice(&mut buf[sz * i..sz * (i + 1)]).unwrap(),
+                    );
+                }
+            }
+
+            fn deserialize(
+                buf: &GenericArray<u8, Self::Size>,
+            ) -> Result<Self, Self::DeserializationError> {
+                // Same per-element length as in `serialize`, so both directions agree.
+                let sz = <<V as Serializable>::Size as typenum::Unsigned>::USIZE;
+                let mut res = [V::ZERO; $width];
+                for i in 0..$width {
+                    res[i] = V::deserialize(GenericArray::from_slice(&buf[sz * i..sz * (i + 1)]))?;
+                }
+                Ok(StdArray(res))
+            }
         }
-        Ok(StdArray(res))
-    }
+    };
 }
 
+impl_serializable!(16, U16);
+impl_serializable!(32, U32);
+impl_serializable!(64, U64);
+impl_serializable!(256, U256);
+
 #[cfg(all(test, unit_test))]
 mod test {
     use std::iter;