diff --git a/sm4/Cargo.toml b/sm4/Cargo.toml index 08e8c6b4..87c04d04 100644 --- a/sm4/Cargo.toml +++ b/sm4/Cargo.toml @@ -14,6 +14,10 @@ categories = ["cryptography", "no-std"] [dependencies] cipher = "0.4.2" +cfg-if = "1" + +[target.'cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))'.dependencies] +cpufeatures = "0.2.11" [dev-dependencies] cipher = { version = "0.4.2", features = ["dev"] } diff --git a/sm4/src/armv8/autodetect/linux.rs b/sm4/src/armv8/autodetect/linux.rs new file mode 100644 index 00000000..d164bb4d --- /dev/null +++ b/sm4/src/armv8/autodetect/linux.rs @@ -0,0 +1,117 @@ +#![allow(unsafe_code)] + +use cipher::{ + consts::U16, AlgorithmName, BlockCipher, BlockClosure, BlockDecrypt, BlockEncrypt, + BlockSizeUser, Key, KeyInit, KeySizeUser, +}; +use core::{fmt, mem::ManuallyDrop}; + +use crate::armv8::{neon::Sm4 as NeonSm4, sm4e::Sm4 as CryptoExtensionSm4}; + +cpufeatures::new!(sm4_intrinsics, "sm4"); + +union Sm4Cipher { + sm4: ManuallyDrop, + neon: ManuallyDrop, +} + +/// SM4 block cipher. +pub struct Sm4 { + cipher: Sm4Cipher, + token: sm4_intrinsics::InitToken, +} + +impl KeySizeUser for Sm4 { + type KeySize = U16; +} + +impl KeyInit for Sm4 { + #[inline] + fn new(key: &Key) -> Self { + let (token, intrinsics_presense) = sm4_intrinsics::init_get(); + + let cipher = if intrinsics_presense { + Sm4Cipher { + sm4: ManuallyDrop::new(CryptoExtensionSm4::new(key)), + } + } else { + Sm4Cipher { + neon: ManuallyDrop::new(NeonSm4::new(key)), + } + }; + + Self { cipher, token } + } +} + +impl Clone for Sm4 { + fn clone(&self) -> Self { + let cipher = if self.token.get() { + Sm4Cipher { + sm4: unsafe { self.cipher.sm4.clone() }, + } + } else { + Sm4Cipher { + neon: unsafe { self.cipher.neon.clone() }, + } + }; + + Self { + cipher, + token: self.token, + } + } +} + +impl BlockSizeUser for Sm4 { + type BlockSize = U16; +} + +impl BlockCipher for Sm4 {} + +impl BlockEncrypt for Sm4 { + fn encrypt_with_backend(&self, f: impl BlockClosure) { + unsafe { + if self.token.get() { + self.cipher.sm4.encrypt_with_backend(f); + } else { + self.cipher.neon.encrypt_with_backend(f); + } + } + } +} + +impl BlockDecrypt for Sm4 { + fn decrypt_with_backend(&self, f: impl BlockClosure) { + unsafe { + if self.token.get() { + self.cipher.sm4.decrypt_with_backend(f); + } else { + self.cipher.neon.decrypt_with_backend(f); + } + } + } +} + +impl fmt::Debug for Sm4 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + f.write_str(concat!(stringify!(Sm4), " { .. }")) + } +} + +impl AlgorithmName for Sm4 { + fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(stringify!(Sm4)) + } +} + +impl Drop for Sm4 { + fn drop(&mut self) { + #[allow(unsafe_code)] + if self.token.get() { + unsafe { ManuallyDrop::drop(&mut self.cipher.sm4) } + } else { + unsafe { ManuallyDrop::drop(&mut self.cipher.neon) } + } + } +} diff --git a/sm4/src/armv8/autodetect/mod.rs b/sm4/src/armv8/autodetect/mod.rs new file mode 100644 index 00000000..bffb8722 --- /dev/null +++ b/sm4/src/armv8/autodetect/mod.rs @@ -0,0 +1,11 @@ +use cfg_if::cfg_if; + +cfg_if! 
{ + if #[cfg(any(target_os = "linux", target_os = "android"))] { + mod linux; + pub use self::linux::*; + } else { + mod others; + pub use self::others::*; + } +} diff --git a/sm4/src/armv8/autodetect/others.rs b/sm4/src/armv8/autodetect/others.rs new file mode 100644 index 00000000..e4f141af --- /dev/null +++ b/sm4/src/armv8/autodetect/others.rs @@ -0,0 +1 @@ +pub use crate::armv8::neon::*; diff --git a/sm4/src/armv8/intrinsics.rs b/sm4/src/armv8/intrinsics.rs new file mode 100644 index 00000000..0bd3a0b6 --- /dev/null +++ b/sm4/src/armv8/intrinsics.rs @@ -0,0 +1,31 @@ +//! ARMv8 extension intrinsics + +#![allow(unsafe_code)] + +use core::arch::{aarch64::*, asm}; + +#[inline] +#[target_feature(enable = "sm4")] +pub(super) unsafe fn vsm4eq_u32(mut a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + asm!( + "SM4E {d:v}.4S, {n:v}.4S", + d = inout(vreg) a, + n = in(vreg) b, + options(pure, nomem, nostack, preserves_flags) + ); + a +} + +#[inline] +#[target_feature(enable = "sm4")] +pub(super) unsafe fn vsm4ekeyq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + let mut key: uint32x4_t; + asm!( + "SM4EKEY {d:v}.4S, {n:v}.4S, {m:v}.4S", + d = out(vreg) key, + n = in(vreg) a, + m = in(vreg) b, + options(pure, nomem, nostack, preserves_flags) + ); + key +} diff --git a/sm4/src/armv8/mod.rs b/sm4/src/armv8/mod.rs new file mode 100644 index 00000000..d0c15c98 --- /dev/null +++ b/sm4/src/armv8/mod.rs @@ -0,0 +1,5 @@ +pub mod autodetect; +mod intrinsics; +mod neon; +#[cfg(any(target_os = "linux", target_os = "android"))] +mod sm4e; diff --git a/sm4/src/armv8/neon.rs b/sm4/src/armv8/neon.rs new file mode 100644 index 00000000..60535125 --- /dev/null +++ b/sm4/src/armv8/neon.rs @@ -0,0 +1,260 @@ +//! SM4 NEON +//! +//! Implementation was borrowed from by kentle. + +#![allow(unsafe_code)] + +#[cfg(feature = "zeroize")] +use cipher::zeroize::{Zeroize, ZeroizeOnDrop}; +use cipher::{ + consts::{U16, U4}, + generic_array::GenericArray, + inout::InOut, + AlgorithmName, Block, BlockCipher, BlockDecrypt, BlockSizeUser, Key, KeyInit, KeySizeUser, + ParBlocks, ParBlocksSizeUser, +}; +use cipher::{BlockBackend, BlockEncrypt}; +use core::{arch::aarch64::*, fmt}; + +use crate::consts::SBOX; + +type ParBlocks4 = GenericArray, U4>; + +#[inline] +#[target_feature(enable = "neon")] +unsafe fn sbox_table_lookup( + sbox_table: &[uint8x16x4_t; 4], + b: uint32x4_t, + dec: uint8x16_t, +) -> uint32x4_t { + let b0 = vreinterpretq_u8_u32(b); + let r0 = vqtbl4q_u8(sbox_table[0], b0); + + let b1 = vsubq_u8(b0, dec); + let r1 = vqtbl4q_u8(sbox_table[1], b1); + + let b2 = vsubq_u8(b1, dec); + let r2 = vqtbl4q_u8(sbox_table[2], b2); + + let b3 = vsubq_u8(b2, dec); + let r3 = vqtbl4q_u8(sbox_table[3], b3); + + // Join results + vreinterpretq_u32_u8(veorq_u8(veorq_u8(veorq_u8(r0, r1), r2), r3)) +} + +#[inline] +#[target_feature(enable = "neon")] +pub(super) unsafe fn sm4_process4( + blocks: InOut<'_, '_, ParBlocks4>, + rk: &[u32; 32], + encrypt: bool, +) { + // SBox + let sbox_table: [uint8x16x4_t; 4] = [ + uint8x16x4_t( + vld1q_u8(SBOX.as_ptr().add(64 * 0 + 16 * 0)), + vld1q_u8(SBOX.as_ptr().add(64 * 0 + 16 * 1)), + vld1q_u8(SBOX.as_ptr().add(64 * 0 + 16 * 2)), + vld1q_u8(SBOX.as_ptr().add(64 * 0 + 16 * 3)), + ), + uint8x16x4_t( + vld1q_u8(SBOX.as_ptr().add(64 * 1 + 16 * 0)), + vld1q_u8(SBOX.as_ptr().add(64 * 1 + 16 * 1)), + vld1q_u8(SBOX.as_ptr().add(64 * 1 + 16 * 2)), + vld1q_u8(SBOX.as_ptr().add(64 * 1 + 16 * 3)), + ), + uint8x16x4_t( + vld1q_u8(SBOX.as_ptr().add(64 * 2 + 16 * 0)), + vld1q_u8(SBOX.as_ptr().add(64 * 2 + 16 * 1)), + 
vld1q_u8(SBOX.as_ptr().add(64 * 2 + 16 * 2)), + vld1q_u8(SBOX.as_ptr().add(64 * 2 + 16 * 3)), + ), + uint8x16x4_t( + vld1q_u8(SBOX.as_ptr().add(64 * 3 + 16 * 0)), + vld1q_u8(SBOX.as_ptr().add(64 * 3 + 16 * 1)), + vld1q_u8(SBOX.as_ptr().add(64 * 3 + 16 * 2)), + vld1q_u8(SBOX.as_ptr().add(64 * 3 + 16 * 3)), + ), + ]; + + // Load data, 4 blocks + let (in_ptr, out_ptr) = blocks.into_raw(); + let mut x: uint32x4x4_t = vld4q_u32(in_ptr as *const _); + + static SUB_DATA: [u8; 16] = [ + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + ]; + + // Index -64 for SBox table lookup + let dec = vld1q_u8(SUB_DATA.as_ptr()); + + // Reverse every 8bits in each blocks + x.0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x.0))); + x.1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x.1))); + x.2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x.2))); + x.3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x.3))); + + // Process loop + for i in 0..32 { + // x1 xor x2 xor x3 xor rk[i] + let mut b = if encrypt { + vdupq_n_u32(rk[i]) + } else { + vdupq_n_u32(rk[31 - i]) + }; + b = veorq_u32(b, x.1); + b = veorq_u32(b, x.2); + b = veorq_u32(b, x.3); + + // SBox lookup + b = sbox_table_lookup(&sbox_table, b, dec); + x.0 = veorq_u32(x.0, b); + + let t1 = vshlq_n_u32(b, 2); + let t2 = vshrq_n_u32(b, 32 - 2); + let t3 = veorq_u32(t1, t2); + x.0 = veorq_u32(x.0, t3); + + let t1 = vshlq_n_u32(b, 10); + let t2 = vshrq_n_u32(b, 32 - 10); + let t3 = veorq_u32(t1, t2); + x.0 = veorq_u32(x.0, t3); + + let t1 = vshlq_n_u32(b, 18); + let t2 = vshrq_n_u32(b, 32 - 18); + let t3 = veorq_u32(t1, t2); + x.0 = veorq_u32(x.0, t3); + + let t1 = vshlq_n_u32(b, 24); + let t2 = vshrq_n_u32(b, 32 - 24); + let t3 = veorq_u32(t1, t2); + x.0 = veorq_u32(x.0, t3); + + b = x.0; + x.0 = x.1; + x.1 = x.2; + x.2 = x.3; + x.3 = b; + } + + // Reverse result blocks + let b0 = x.0; + x.0 = x.3; + x.3 = b0; + let b1 = x.1; + x.1 = x.2; + x.2 = b1; + + // Reverse 8bits in blocks + x.0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x.0))); + x.1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x.1))); + x.2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x.2))); + x.3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x.3))); + + vst4q_u32(out_ptr as *mut _, x); +} + +/// SM4 block cipher. +#[derive(Clone)] +pub struct Sm4 { + rk: [u32; 32], +} + +impl BlockCipher for Sm4 {} + +impl KeySizeUser for Sm4 { + type KeySize = U16; +} + +impl KeyInit for Sm4 { + fn new(key: &Key) -> Self { + Sm4 { + rk: crate::soft::sm4_init_key::(key), + } + } +} + +impl fmt::Debug for Sm4 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("Sm4 { ... 
}") + } +} + +impl AlgorithmName for Sm4 { + fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("Sm4") + } +} + +#[cfg(feature = "zeroize")] +#[cfg_attr(docsrs, doc(cfg(feature = "zeroize")))] +impl Drop for Sm4 { + fn drop(&mut self) { + self.rk.zeroize(); + } +} + +#[cfg(feature = "zeroize")] +#[cfg_attr(docsrs, doc(cfg(feature = "zeroize")))] +impl ZeroizeOnDrop for Sm4 {} + +impl BlockSizeUser for Sm4 { + type BlockSize = U16; +} + +impl BlockEncrypt for Sm4 { + fn encrypt_with_backend(&self, f: impl cipher::BlockClosure) { + f.call(&mut Sm4Enc(self)) + } +} + +pub struct Sm4Enc<'a>(&'a Sm4); + +impl<'a> BlockSizeUser for Sm4Enc<'a> { + type BlockSize = U16; +} + +impl<'a> ParBlocksSizeUser for Sm4Enc<'a> { + type ParBlocksSize = U4; +} + +impl<'a> BlockBackend for Sm4Enc<'a> { + #[inline(always)] + fn proc_block(&mut self, block: InOut<'_, '_, Block>) { + crate::soft::sm4_encrypt::(block, &self.0.rk); + } + + #[inline(always)] + fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, ParBlocks>) { + unsafe { sm4_process4::(blocks, &self.0.rk, true) } + } +} + +impl BlockDecrypt for Sm4 { + fn decrypt_with_backend(&self, f: impl cipher::BlockClosure) { + f.call(&mut Sm4Dec(self)) + } +} + +pub struct Sm4Dec<'a>(&'a Sm4); + +impl<'a> BlockSizeUser for Sm4Dec<'a> { + type BlockSize = U16; +} + +impl<'a> ParBlocksSizeUser for Sm4Dec<'a> { + type ParBlocksSize = U4; +} + +impl<'a> BlockBackend for Sm4Dec<'a> { + #[inline(always)] + fn proc_block(&mut self, block: InOut<'_, '_, Block>) { + crate::soft::sm4_decrypt::(block, &self.0.rk); + } + + #[inline(always)] + fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, ParBlocks>) { + unsafe { sm4_process4::(blocks, &self.0.rk, false) } + } +} diff --git a/sm4/src/armv8/sm4e.rs b/sm4/src/armv8/sm4e.rs new file mode 100644 index 00000000..7c3873f8 --- /dev/null +++ b/sm4/src/armv8/sm4e.rs @@ -0,0 +1,444 @@ +//! SM4 implementation with SM4 extension instruction set +//! +//! 
Implementation is from and Linux kernel arch/arm64/crypto/sm4-ce-core.S + +#![allow(unsafe_code)] + +#[cfg(feature = "zeroize")] +use cipher::zeroize::ZeroizeOnDrop; +use cipher::{ + consts::{U16, U4, U8}, + generic_array::GenericArray, + inout::{InOut, InOutBuf}, + AlgorithmName, Block, BlockCipher, BlockDecrypt, BlockSizeUser, Key, KeyInit, KeySizeUser, + ParBlocks, ParBlocksSizeUser, Unsigned, +}; +use cipher::{BlockBackend, BlockEncrypt}; +use core::{arch::aarch64::*, fmt}; + +use crate::consts::{CK, FK}; + +#[inline] +#[target_feature(enable = "sm4")] +pub(crate) unsafe fn sm4_init_key(key: &Key) -> [u32; 32] { + let mut mk: uint8x16_t = vld1q_u8(key.as_ptr() as *const _); + mk = vrev32q_u8(mk); + let fk: uint8x16_t = vld1q_u8(FK.as_ptr() as *const _); + + let ck0 = vld1q_u32(CK.as_ptr().add(0)); + let ck1 = vld1q_u32(CK.as_ptr().add(4)); + let ck2 = vld1q_u32(CK.as_ptr().add(8)); + let ck3 = vld1q_u32(CK.as_ptr().add(12)); + let ck4 = vld1q_u32(CK.as_ptr().add(16)); + let ck5 = vld1q_u32(CK.as_ptr().add(20)); + let ck6 = vld1q_u32(CK.as_ptr().add(24)); + let ck7 = vld1q_u32(CK.as_ptr().add(28)); + + // input ^ mk + let rk = vreinterpretq_u32_u8(veorq_u8(mk, fk)); + + let k0 = super::intrinsics::vsm4ekeyq_u32(rk, ck0); + let k1 = super::intrinsics::vsm4ekeyq_u32(k0, ck1); + let k2 = super::intrinsics::vsm4ekeyq_u32(k1, ck2); + let k3 = super::intrinsics::vsm4ekeyq_u32(k2, ck3); + let k4 = super::intrinsics::vsm4ekeyq_u32(k3, ck4); + let k5 = super::intrinsics::vsm4ekeyq_u32(k4, ck5); + let k6 = super::intrinsics::vsm4ekeyq_u32(k5, ck6); + let k7 = super::intrinsics::vsm4ekeyq_u32(k6, ck7); + + let mut rkey = [0u32; 32]; + vst1q_u32(rkey.as_mut_ptr().add(0), k0); + vst1q_u32(rkey.as_mut_ptr().add(4), k1); + vst1q_u32(rkey.as_mut_ptr().add(8), k2); + vst1q_u32(rkey.as_mut_ptr().add(12), k3); + vst1q_u32(rkey.as_mut_ptr().add(16), k4); + vst1q_u32(rkey.as_mut_ptr().add(20), k5); + vst1q_u32(rkey.as_mut_ptr().add(24), k6); + vst1q_u32(rkey.as_mut_ptr().add(28), k7); + + rkey +} + +#[inline] +unsafe fn qswap_32(b: uint32x4_t) -> uint32x4_t { + static QSWAP_TBL: [u8; 16] = [12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3]; + vreinterpretq_u32_u8(vqtbl1q_u8( + vreinterpretq_u8_u32(b), + vld1q_u8(QSWAP_TBL.as_ptr()), + )) +} + +#[inline] +unsafe fn bswap_32(b: uint32x4_t) -> uint32x4_t { + vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(b))) +} + +/// Swap both the quad-words and bytes within each word +/// equivalent to return bswap_32(qswap_32(B)) +#[inline] +unsafe fn bqswap_32(b: uint32x4_t) -> uint32x4_t { + static BSWAP_TBL: [u8; 16] = [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]; + return vreinterpretq_u32_u8(vqtbl1q_u8( + vreinterpretq_u8_u32(b), + vld1q_u8(BSWAP_TBL.as_ptr()), + )); +} + +macro_rules! 
sm4_e { + ($($b:ident),+ @ $k:expr) => { + $( + $b = super::intrinsics::vsm4eq_u32($b, $k); + )+ + } +} + +type ParBlocks4 = GenericArray, U4>; +type ParBlocks8 = GenericArray, U8>; + +#[inline] +#[target_feature(enable = "sm4")] +pub(super) unsafe fn sm4_encrypt4( + blocks: InOut<'_, '_, ParBlocks4>, + rk: &[uint32x4_t; 8], +) { + let (in_ptr, out_ptr) = blocks.into_raw(); + let input32 = in_ptr as *const u32; + let output32 = out_ptr as *mut u32; + + let mut b0 = bswap_32(vld1q_u32(input32.add(0))); + let mut b1 = bswap_32(vld1q_u32(input32.add(4))); + let mut b2 = bswap_32(vld1q_u32(input32.add(8))); + let mut b3 = bswap_32(vld1q_u32(input32.add(12))); + + sm4_e!(b0, b1, b2, b3 @ rk[0]); + sm4_e!(b0, b1, b2, b3 @ rk[1]); + sm4_e!(b0, b1, b2, b3 @ rk[2]); + sm4_e!(b0, b1, b2, b3 @ rk[3]); + sm4_e!(b0, b1, b2, b3 @ rk[4]); + sm4_e!(b0, b1, b2, b3 @ rk[5]); + sm4_e!(b0, b1, b2, b3 @ rk[6]); + sm4_e!(b0, b1, b2, b3 @ rk[7]); + + vst1q_u32(output32.add(0), bqswap_32(b0)); + vst1q_u32(output32.add(4), bqswap_32(b1)); + vst1q_u32(output32.add(8), bqswap_32(b2)); + vst1q_u32(output32.add(12), bqswap_32(b3)); +} + +#[inline] +#[target_feature(enable = "sm4")] +pub(super) unsafe fn sm4_encrypt8( + blocks: InOut<'_, '_, ParBlocks8>, + rk: &[uint32x4_t; 8], +) { + let (in_ptr, out_ptr) = blocks.into_raw(); + let input32 = in_ptr as *const u32; + let output32 = out_ptr as *mut u32; + + let mut b0 = bswap_32(vld1q_u32(input32.add(0))); + let mut b1 = bswap_32(vld1q_u32(input32.add(4))); + let mut b2 = bswap_32(vld1q_u32(input32.add(8))); + let mut b3 = bswap_32(vld1q_u32(input32.add(12))); + let mut b4 = bswap_32(vld1q_u32(input32.add(16))); + let mut b5 = bswap_32(vld1q_u32(input32.add(20))); + let mut b6 = bswap_32(vld1q_u32(input32.add(24))); + let mut b7 = bswap_32(vld1q_u32(input32.add(28))); + + sm4_e!(b0, b1, b2, b3, b4, b5, b6, b7 @ rk[0]); + sm4_e!(b0, b1, b2, b3, b4, b5, b6, b7 @ rk[1]); + sm4_e!(b0, b1, b2, b3, b4, b5, b6, b7 @ rk[2]); + sm4_e!(b0, b1, b2, b3, b4, b5, b6, b7 @ rk[3]); + sm4_e!(b0, b1, b2, b3, b4, b5, b6, b7 @ rk[4]); + sm4_e!(b0, b1, b2, b3, b4, b5, b6, b7 @ rk[5]); + sm4_e!(b0, b1, b2, b3, b4, b5, b6, b7 @ rk[6]); + sm4_e!(b0, b1, b2, b3, b4, b5, b6, b7 @ rk[7]); + + vst1q_u32(output32.add(0), bqswap_32(b0)); + vst1q_u32(output32.add(4), bqswap_32(b1)); + vst1q_u32(output32.add(8), bqswap_32(b2)); + vst1q_u32(output32.add(12), bqswap_32(b3)); + vst1q_u32(output32.add(16), bqswap_32(b4)); + vst1q_u32(output32.add(20), bqswap_32(b5)); + vst1q_u32(output32.add(24), bqswap_32(b6)); + vst1q_u32(output32.add(28), bqswap_32(b7)); +} + +#[inline] +#[target_feature(enable = "sm4")] +pub(super) unsafe fn sm4_encrypt1( + block: InOut<'_, '_, Block>, + rk: &[uint32x4_t; 8], +) { + let (in_ptr, out_ptr) = block.into_raw(); + let input32 = in_ptr as *const u32; + let output32 = out_ptr as *mut u32; + + let mut b = bswap_32(vld1q_u32(input32)); + + sm4_e!(b @ rk[0]); + sm4_e!(b @ rk[1]); + sm4_e!(b @ rk[2]); + sm4_e!(b @ rk[3]); + sm4_e!(b @ rk[4]); + sm4_e!(b @ rk[5]); + sm4_e!(b @ rk[6]); + sm4_e!(b @ rk[7]); + + vst1q_u32(output32, bqswap_32(b)); +} + +#[inline] +#[target_feature(enable = "sm4")] +pub(super) unsafe fn sm4_decrypt4( + blocks: InOut<'_, '_, ParBlocks4>, + rk: &[uint32x4_t; 8], +) { + let (in_ptr, out_ptr) = blocks.into_raw(); + let input32 = in_ptr as *const u32; + let output32 = out_ptr as *mut u32; + + let mut b0 = bswap_32(vld1q_u32(input32.add(0))); + let mut b1 = bswap_32(vld1q_u32(input32.add(4))); + let mut b2 = bswap_32(vld1q_u32(input32.add(8))); + let mut b3 
= bswap_32(vld1q_u32(input32.add(12))); + + sm4_e!(b0, b1, b2, b3 @ rk[7]); + sm4_e!(b0, b1, b2, b3 @ rk[6]); + sm4_e!(b0, b1, b2, b3 @ rk[5]); + sm4_e!(b0, b1, b2, b3 @ rk[4]); + sm4_e!(b0, b1, b2, b3 @ rk[3]); + sm4_e!(b0, b1, b2, b3 @ rk[2]); + sm4_e!(b0, b1, b2, b3 @ rk[1]); + sm4_e!(b0, b1, b2, b3 @ rk[0]); + + vst1q_u32(output32.add(0), bqswap_32(b0)); + vst1q_u32(output32.add(4), bqswap_32(b1)); + vst1q_u32(output32.add(8), bqswap_32(b2)); + vst1q_u32(output32.add(12), bqswap_32(b3)); +} + +#[inline] +#[target_feature(enable = "sm4")] +pub(super) unsafe fn sm4_decrypt8( + blocks: InOut<'_, '_, ParBlocks8>, + rk: &[uint32x4_t; 8], +) { + let (in_ptr, out_ptr) = blocks.into_raw(); + let input32 = in_ptr as *const u32; + let output32 = out_ptr as *mut u32; + + let mut b0 = bswap_32(vld1q_u32(input32.add(0))); + let mut b1 = bswap_32(vld1q_u32(input32.add(4))); + let mut b2 = bswap_32(vld1q_u32(input32.add(8))); + let mut b3 = bswap_32(vld1q_u32(input32.add(12))); + let mut b4 = bswap_32(vld1q_u32(input32.add(16))); + let mut b5 = bswap_32(vld1q_u32(input32.add(20))); + let mut b6 = bswap_32(vld1q_u32(input32.add(24))); + let mut b7 = bswap_32(vld1q_u32(input32.add(28))); + + sm4_e!(b0, b1, b2, b3, b4, b5, b6, b7 @ rk[7]); + sm4_e!(b0, b1, b2, b3, b4, b5, b6, b7 @ rk[6]); + sm4_e!(b0, b1, b2, b3, b4, b5, b6, b7 @ rk[5]); + sm4_e!(b0, b1, b2, b3, b4, b5, b6, b7 @ rk[4]); + sm4_e!(b0, b1, b2, b3, b4, b5, b6, b7 @ rk[3]); + sm4_e!(b0, b1, b2, b3, b4, b5, b6, b7 @ rk[2]); + sm4_e!(b0, b1, b2, b3, b4, b5, b6, b7 @ rk[1]); + sm4_e!(b0, b1, b2, b3, b4, b5, b6, b7 @ rk[0]); + + vst1q_u32(output32.add(0), bqswap_32(b0)); + vst1q_u32(output32.add(4), bqswap_32(b1)); + vst1q_u32(output32.add(8), bqswap_32(b2)); + vst1q_u32(output32.add(12), bqswap_32(b3)); + vst1q_u32(output32.add(16), bqswap_32(b4)); + vst1q_u32(output32.add(20), bqswap_32(b5)); + vst1q_u32(output32.add(24), bqswap_32(b6)); + vst1q_u32(output32.add(28), bqswap_32(b7)); +} + +#[inline] +#[target_feature(enable = "sm4")] +pub(super) unsafe fn sm4_decrypt1( + block: InOut<'_, '_, Block>, + rk: &[uint32x4_t; 8], +) { + let (in_ptr, out_ptr) = block.into_raw(); + let input32 = in_ptr as *const u32; + let output32 = out_ptr as *mut u32; + + let mut b = bswap_32(vld1q_u32(input32)); + + sm4_e!(b @ rk[7]); + sm4_e!(b @ rk[6]); + sm4_e!(b @ rk[5]); + sm4_e!(b @ rk[4]); + sm4_e!(b @ rk[3]); + sm4_e!(b @ rk[2]); + sm4_e!(b @ rk[1]); + sm4_e!(b @ rk[0]); + + vst1q_u32(output32, bqswap_32(b)); +} + +/// SM4 block cipher. +#[derive(Clone)] +pub struct Sm4 { + erk: [uint32x4_t; 8], + drk: [uint32x4_t; 8], +} + +impl BlockCipher for Sm4 {} + +impl KeySizeUser for Sm4 { + type KeySize = U16; +} + +impl KeyInit for Sm4 { + fn new(key: &Key) -> Self { + unsafe { + let rk = sm4_init_key::(key); + + let erk = [ + vld1q_u32(rk.as_ptr().add(0)), + vld1q_u32(rk.as_ptr().add(4)), + vld1q_u32(rk.as_ptr().add(8)), + vld1q_u32(rk.as_ptr().add(12)), + vld1q_u32(rk.as_ptr().add(16)), + vld1q_u32(rk.as_ptr().add(20)), + vld1q_u32(rk.as_ptr().add(24)), + vld1q_u32(rk.as_ptr().add(28)), + ]; + + let drk = [ + qswap_32(erk[0]), + qswap_32(erk[1]), + qswap_32(erk[2]), + qswap_32(erk[3]), + qswap_32(erk[4]), + qswap_32(erk[5]), + qswap_32(erk[6]), + qswap_32(erk[7]), + ]; + + Sm4 { erk, drk } + } + } +} + +impl fmt::Debug for Sm4 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("Sm4 { ... 
}") + } +} + +impl AlgorithmName for Sm4 { + fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("Sm4") + } +} + +#[cfg(feature = "zeroize")] +#[cfg_attr(docsrs, doc(cfg(feature = "zeroize")))] +impl Drop for Sm4 { + fn drop(&mut self) { + unsafe { + for i in 0..self.erk.len() { + self.erk[i] = veorq_u32(self.erk[i], self.erk[i]); + } + for i in 0..self.drk.len() { + self.drk[i] = veorq_u32(self.drk[i], self.drk[i]); + } + } + } +} + +#[cfg(feature = "zeroize")] +#[cfg_attr(docsrs, doc(cfg(feature = "zeroize")))] +impl ZeroizeOnDrop for Sm4 {} + +impl BlockSizeUser for Sm4 { + type BlockSize = U16; +} + +impl BlockEncrypt for Sm4 { + fn encrypt_with_backend(&self, f: impl cipher::BlockClosure) { + f.call(&mut Sm4Enc(self)) + } +} + +pub struct Sm4Enc<'a>(&'a Sm4); + +impl<'a> BlockSizeUser for Sm4Enc<'a> { + type BlockSize = U16; +} + +impl<'a> ParBlocksSizeUser for Sm4Enc<'a> { + type ParBlocksSize = U8; +} + +impl<'a> BlockBackend for Sm4Enc<'a> { + #[inline(always)] + fn proc_block(&mut self, block: InOut<'_, '_, Block>) { + unsafe { sm4_encrypt1::(block, &self.0.erk) } + } + + #[inline(always)] + fn proc_tail_blocks(&mut self, blocks: InOutBuf<'_, '_, Block>) { + assert!(blocks.len() < Self::ParBlocksSize::USIZE); + + let (chunks, tail) = blocks.into_chunks::(); + for chunk in chunks { + unsafe { sm4_encrypt4::(chunk, &self.0.erk) } + } + + for block in tail { + self.proc_block(block); + } + } + + #[inline(always)] + fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, ParBlocks>) { + unsafe { sm4_encrypt8::(blocks, &self.0.erk) } + } +} + +impl BlockDecrypt for Sm4 { + fn decrypt_with_backend(&self, f: impl cipher::BlockClosure) { + f.call(&mut Sm4Dec(self)) + } +} + +pub struct Sm4Dec<'a>(&'a Sm4); + +impl<'a> BlockSizeUser for Sm4Dec<'a> { + type BlockSize = U16; +} + +impl<'a> ParBlocksSizeUser for Sm4Dec<'a> { + type ParBlocksSize = U8; +} + +impl<'a> BlockBackend for Sm4Dec<'a> { + #[inline(always)] + fn proc_block(&mut self, block: InOut<'_, '_, Block>) { + unsafe { sm4_decrypt1::(block, &self.0.drk) } + } + + #[inline(always)] + fn proc_tail_blocks(&mut self, blocks: InOutBuf<'_, '_, Block>) { + assert!(blocks.len() < Self::ParBlocksSize::USIZE); + + let (chunks, tail) = blocks.into_chunks::(); + for chunk in chunks { + unsafe { sm4_decrypt4::(chunk, &self.0.drk) } + } + + for block in tail { + self.proc_block(block); + } + } + + #[inline(always)] + fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, ParBlocks>) { + unsafe { sm4_decrypt8::(blocks, &self.0.drk) } + } +} diff --git a/sm4/src/lib.rs b/sm4/src/lib.rs index 2596cf27..b52e1c85 100644 --- a/sm4/src/lib.rs +++ b/sm4/src/lib.rs @@ -19,157 +19,23 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![warn(missing_docs, rust_2018_idioms)] +use cfg_if::cfg_if; pub use cipher; -use cipher::{consts::U16, AlgorithmName, BlockCipher, Key, KeyInit, KeySizeUser}; -use core::fmt; - -#[cfg(feature = "zeroize")] -use cipher::zeroize::{Zeroize, ZeroizeOnDrop}; - mod consts; -use consts::{CK, FK, SBOX}; - -#[inline] -fn tau(a: u32) -> u32 { - let mut buf = a.to_be_bytes(); - buf[0] = SBOX[buf[0] as usize]; - buf[1] = SBOX[buf[1] as usize]; - buf[2] = SBOX[buf[2] as usize]; - buf[3] = SBOX[buf[3] as usize]; - u32::from_be_bytes(buf) -} - -/// L: linear transformation -#[inline] -fn el(b: u32) -> u32 { - b ^ b.rotate_left(2) ^ b.rotate_left(10) ^ b.rotate_left(18) ^ b.rotate_left(24) -} - -#[inline] -fn el_prime(b: u32) -> u32 { - b ^ b.rotate_left(13) ^ b.rotate_left(23) -} - -#[inline] -fn t(val: u32) -> u32 { - 
el(tau(val)) -} - -#[inline] -fn t_prime(val: u32) -> u32 { - el_prime(tau(val)) -} - -/// SM4 block cipher. -#[derive(Clone)] -pub struct Sm4 { - rk: [u32; 32], -} - -impl BlockCipher for Sm4 {} - -impl KeySizeUser for Sm4 { - type KeySize = U16; -} - -impl KeyInit for Sm4 { - fn new(key: &Key) -> Self { - let mk = [ - u32::from_be_bytes(key[0..4].try_into().unwrap()), - u32::from_be_bytes(key[4..8].try_into().unwrap()), - u32::from_be_bytes(key[8..12].try_into().unwrap()), - u32::from_be_bytes(key[12..16].try_into().unwrap()), - ]; - let mut rk = [0u32; 32]; - let mut k = [mk[0] ^ FK[0], mk[1] ^ FK[1], mk[2] ^ FK[2], mk[3] ^ FK[3]]; - - for i in 0..8 { - k[0] ^= t_prime(k[1] ^ k[2] ^ k[3] ^ CK[i * 4]); - k[1] ^= t_prime(k[2] ^ k[3] ^ k[0] ^ CK[i * 4 + 1]); - k[2] ^= t_prime(k[3] ^ k[0] ^ k[1] ^ CK[i * 4 + 2]); - k[3] ^= t_prime(k[0] ^ k[1] ^ k[2] ^ CK[i * 4 + 3]); - - rk[i * 4] = k[0]; - rk[i * 4 + 1] = k[1]; - rk[i * 4 + 2] = k[2]; - rk[i * 4 + 3] = k[3]; - } - - Sm4 { rk } - } -} - -impl fmt::Debug for Sm4 { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("Sm4 { ... }") +mod soft; + +cfg_if! { + if #[cfg(all(target_arch = "aarch64", sm4_armv8, not(sm4_force_soft)))] { + mod armv8; + pub use self::armv8::autodetect::*; + } else if #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + not(sm4_force_soft) + ))] { + mod x86; + pub use self::x86::autodetect::*; + } else { + pub use soft::*; } } - -impl AlgorithmName for Sm4 { - fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("Sm4") - } -} - -#[cfg(feature = "zeroize")] -#[cfg_attr(docsrs, doc(cfg(feature = "zeroize")))] -impl Drop for Sm4 { - fn drop(&mut self) { - self.rk.zeroize(); - } -} - -#[cfg(feature = "zeroize")] -#[cfg_attr(docsrs, doc(cfg(feature = "zeroize")))] -impl ZeroizeOnDrop for Sm4 {} - -cipher::impl_simple_block_encdec!( - Sm4, U16, cipher, block, - encrypt: { - let b = block.get_in(); - let mut x = [ - u32::from_be_bytes(b[0..4].try_into().unwrap()), - u32::from_be_bytes(b[4..8].try_into().unwrap()), - u32::from_be_bytes(b[8..12].try_into().unwrap()), - u32::from_be_bytes(b[12..16].try_into().unwrap()), - ]; - - let rk = &cipher.rk; - for i in 0..8 { - x[0] ^= t(x[1] ^ x[2] ^ x[3] ^ rk[i * 4]); - x[1] ^= t(x[2] ^ x[3] ^ x[0] ^ rk[i * 4 + 1]); - x[2] ^= t(x[3] ^ x[0] ^ x[1] ^ rk[i * 4 + 2]); - x[3] ^= t(x[0] ^ x[1] ^ x[2] ^ rk[i * 4 + 3]); - } - - let block = block.get_out(); - block[0..4].copy_from_slice(&x[3].to_be_bytes()); - block[4..8].copy_from_slice(&x[2].to_be_bytes()); - block[8..12].copy_from_slice(&x[1].to_be_bytes()); - block[12..16].copy_from_slice(&x[0].to_be_bytes()); - } - decrypt: { - let b = block.get_in(); - let mut x = [ - u32::from_be_bytes(b[0..4].try_into().unwrap()), - u32::from_be_bytes(b[4..8].try_into().unwrap()), - u32::from_be_bytes(b[8..12].try_into().unwrap()), - u32::from_be_bytes(b[12..16].try_into().unwrap()), - ]; - - let rk = &cipher.rk; - for i in 0..8 { - x[0] ^= t(x[1] ^ x[2] ^ x[3] ^ rk[31 - i * 4]); - x[1] ^= t(x[2] ^ x[3] ^ x[0] ^ rk[31 - (i * 4 + 1)]); - x[2] ^= t(x[3] ^ x[0] ^ x[1] ^ rk[31 - (i * 4 + 2)]); - x[3] ^= t(x[0] ^ x[1] ^ x[2] ^ rk[31 - (i * 4 + 3)]); - } - - let block = block.get_out(); - block[0..4].copy_from_slice(&x[3].to_be_bytes()); - block[4..8].copy_from_slice(&x[2].to_be_bytes()); - block[8..12].copy_from_slice(&x[1].to_be_bytes()); - block[12..16].copy_from_slice(&x[0].to_be_bytes()); - } -); diff --git a/sm4/src/soft.rs b/sm4/src/soft.rs new file mode 100644 index 00000000..eecbb835 --- 
/dev/null +++ b/sm4/src/soft.rs @@ -0,0 +1,170 @@ +#[cfg(feature = "zeroize")] +use cipher::zeroize::{Zeroize, ZeroizeOnDrop}; +use cipher::{ + consts::U16, inout::InOut, AlgorithmName, Block, BlockCipher, BlockSizeUser, Key, KeyInit, + KeySizeUser, +}; +use core::{convert::TryInto, fmt}; + +use crate::consts::{CK, FK, SBOX}; + +#[inline] +fn tau(a: u32) -> u32 { + let mut buf = a.to_be_bytes(); + buf[0] = SBOX[buf[0] as usize]; + buf[1] = SBOX[buf[1] as usize]; + buf[2] = SBOX[buf[2] as usize]; + buf[3] = SBOX[buf[3] as usize]; + u32::from_be_bytes(buf) +} + +/// L: linear transformation +#[inline] +fn el(b: u32) -> u32 { + b ^ b.rotate_left(2) ^ b.rotate_left(10) ^ b.rotate_left(18) ^ b.rotate_left(24) +} + +#[inline] +fn el_prime(b: u32) -> u32 { + b ^ b.rotate_left(13) ^ b.rotate_left(23) +} + +#[inline] +fn t(val: u32) -> u32 { + el(tau(val)) +} + +#[inline] +fn t_prime(val: u32) -> u32 { + el_prime(tau(val)) +} + +#[inline] +pub(crate) fn sm4_init_key(key: &Key) -> [u32; 32] { + let mk = [ + u32::from_be_bytes(key[0..4].try_into().unwrap()), + u32::from_be_bytes(key[4..8].try_into().unwrap()), + u32::from_be_bytes(key[8..12].try_into().unwrap()), + u32::from_be_bytes(key[12..16].try_into().unwrap()), + ]; + let mut rk = [0u32; 32]; + let mut k = [mk[0] ^ FK[0], mk[1] ^ FK[1], mk[2] ^ FK[2], mk[3] ^ FK[3]]; + + for i in 0..8 { + k[0] ^= t_prime(k[1] ^ k[2] ^ k[3] ^ CK[i * 4]); + k[1] ^= t_prime(k[2] ^ k[3] ^ k[0] ^ CK[i * 4 + 1]); + k[2] ^= t_prime(k[3] ^ k[0] ^ k[1] ^ CK[i * 4 + 2]); + k[3] ^= t_prime(k[0] ^ k[1] ^ k[2] ^ CK[i * 4 + 3]); + + rk[i * 4] = k[0]; + rk[i * 4 + 1] = k[1]; + rk[i * 4 + 2] = k[2]; + rk[i * 4 + 3] = k[3]; + } + + rk +} + +#[inline] +#[allow(unused)] +pub(super) fn sm4_encrypt(mut block: InOut<'_, '_, Block>, rk: &[u32; 32]) { + let b = block.get_in(); + let mut x = [ + u32::from_be_bytes(b[0..4].try_into().unwrap()), + u32::from_be_bytes(b[4..8].try_into().unwrap()), + u32::from_be_bytes(b[8..12].try_into().unwrap()), + u32::from_be_bytes(b[12..16].try_into().unwrap()), + ]; + + for i in 0..8 { + x[0] ^= t(x[1] ^ x[2] ^ x[3] ^ rk[i * 4]); + x[1] ^= t(x[2] ^ x[3] ^ x[0] ^ rk[i * 4 + 1]); + x[2] ^= t(x[3] ^ x[0] ^ x[1] ^ rk[i * 4 + 2]); + x[3] ^= t(x[0] ^ x[1] ^ x[2] ^ rk[i * 4 + 3]); + } + + let block = block.get_out(); + block[0..4].copy_from_slice(&x[3].to_be_bytes()); + block[4..8].copy_from_slice(&x[2].to_be_bytes()); + block[8..12].copy_from_slice(&x[1].to_be_bytes()); + block[12..16].copy_from_slice(&x[0].to_be_bytes()); +} + +#[inline] +#[allow(unused)] +pub(super) fn sm4_decrypt(mut block: InOut<'_, '_, Block>, rk: &[u32; 32]) { + let b = block.get_in(); + let mut x = [ + u32::from_be_bytes(b[0..4].try_into().unwrap()), + u32::from_be_bytes(b[4..8].try_into().unwrap()), + u32::from_be_bytes(b[8..12].try_into().unwrap()), + u32::from_be_bytes(b[12..16].try_into().unwrap()), + ]; + + for i in 0..8 { + x[0] ^= t(x[1] ^ x[2] ^ x[3] ^ rk[31 - i * 4]); + x[1] ^= t(x[2] ^ x[3] ^ x[0] ^ rk[31 - (i * 4 + 1)]); + x[2] ^= t(x[3] ^ x[0] ^ x[1] ^ rk[31 - (i * 4 + 2)]); + x[3] ^= t(x[0] ^ x[1] ^ x[2] ^ rk[31 - (i * 4 + 3)]); + } + + let block = block.get_out(); + block[0..4].copy_from_slice(&x[3].to_be_bytes()); + block[4..8].copy_from_slice(&x[2].to_be_bytes()); + block[8..12].copy_from_slice(&x[1].to_be_bytes()); + block[12..16].copy_from_slice(&x[0].to_be_bytes()); +} + +/// SM4 block cipher. 
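+///
+/// Portable software implementation. It is also the runtime fallback on x86 when
+/// neither AES-NI nor AVX2 is available, and the NEON, AES-NI and AVX2 backends
+/// reuse its key schedule (`sm4_init_key`) and single-block routines.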
+#[derive(Clone)] +pub struct Sm4 { + rk: [u32; 32], +} + +impl BlockCipher for Sm4 {} + +impl KeySizeUser for Sm4 { + type KeySize = U16; +} + +impl KeyInit for Sm4 { + fn new(key: &Key) -> Self { + Sm4 { + rk: sm4_init_key::(key), + } + } +} + +impl fmt::Debug for Sm4 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("Sm4 { ... }") + } +} + +impl AlgorithmName for Sm4 { + fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("Sm4") + } +} + +#[cfg(feature = "zeroize")] +#[cfg_attr(docsrs, doc(cfg(feature = "zeroize")))] +impl Drop for Sm4 { + fn drop(&mut self) { + self.rk.zeroize(); + } +} + +#[cfg(feature = "zeroize")] +#[cfg_attr(docsrs, doc(cfg(feature = "zeroize")))] +impl ZeroizeOnDrop for Sm4 {} + +cipher::impl_simple_block_encdec!( + Sm4, U16, cipher, block, + encrypt: { + sm4_encrypt::(block, &cipher.rk); + } + decrypt: { + sm4_decrypt::(block, &cipher.rk); + } +); diff --git a/sm4/src/x86/aesni.rs b/sm4/src/x86/aesni.rs new file mode 100644 index 00000000..200e3366 --- /dev/null +++ b/sm4/src/x86/aesni.rs @@ -0,0 +1,374 @@ +//! SM4 with X86 AES-NI instruction set +//! +//! Implementation was borrowed from by kentle. + +#![allow(unsafe_code)] + +#[cfg(feature = "zeroize")] +use cipher::zeroize::{Zeroize, ZeroizeOnDrop}; +use cipher::{ + consts::{U16, U4}, + generic_array::GenericArray, + inout::InOut, + AlgorithmName, Block, BlockCipher, BlockDecrypt, BlockSizeUser, Key, KeyInit, KeySizeUser, + ParBlocks, ParBlocksSizeUser, +}; +use cipher::{BlockBackend, BlockEncrypt}; +use core::{arch::x86_64::*, fmt}; + +#[inline] +unsafe fn mm_pack0_epi32(a: __m128i, b: __m128i, c: __m128i, d: __m128i) -> __m128i { + _mm_unpacklo_epi64(_mm_unpacklo_epi32(a, b), _mm_unpacklo_epi32(c, d)) +} + +#[inline] +unsafe fn mm_pack1_epi32(a: __m128i, b: __m128i, c: __m128i, d: __m128i) -> __m128i { + _mm_unpackhi_epi64(_mm_unpacklo_epi32(a, b), _mm_unpacklo_epi32(c, d)) +} + +#[inline] +unsafe fn mm_pack2_epi32(a: __m128i, b: __m128i, c: __m128i, d: __m128i) -> __m128i { + _mm_unpacklo_epi64(_mm_unpackhi_epi32(a, b), _mm_unpackhi_epi32(c, d)) +} + +#[inline] +unsafe fn mm_pack3_epi32(a: __m128i, b: __m128i, c: __m128i, d: __m128i) -> __m128i { + _mm_unpackhi_epi64(_mm_unpackhi_epi32(a, b), _mm_unpackhi_epi32(c, d)) +} + +#[inline] +unsafe fn mm_xor2(a: __m128i, b: __m128i) -> __m128i { + _mm_xor_si128(a, b) +} + +#[inline] +unsafe fn mm_xor3(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + mm_xor2(a, mm_xor2(b, c)) +} + +#[inline] +unsafe fn mm_xor4(a: __m128i, b: __m128i, c: __m128i, d: __m128i) -> __m128i { + mm_xor2(a, mm_xor3(b, c, d)) +} + +#[inline] +unsafe fn mm_xor5(a: __m128i, b: __m128i, c: __m128i, d: __m128i, e: __m128i) -> __m128i { + mm_xor2(a, mm_xor4(b, c, d, e)) +} + +#[inline] +unsafe fn mm_xor6( + a: __m128i, + b: __m128i, + c: __m128i, + d: __m128i, + e: __m128i, + f: __m128i, +) -> __m128i { + mm_xor2(a, mm_xor5(b, c, d, e, f)) +} + +macro_rules! 
mm_rotl_epi32 { + ($a:expr, $n:literal) => { + mm_xor2(_mm_slli_epi32::<$n>($a), _mm_srli_epi32::<{ 32 - $n }>($a)) + }; +} + +#[inline] +unsafe fn mul_matrix(x: __m128i, higher_mask: __m128i, lower_mask: __m128i) -> __m128i { + let and_mask = _mm_set1_epi32(0x0f0f0f0f); + let mut tmp2 = _mm_srli_epi16(x, 4); + let mut tmp1 = _mm_and_si128(x, and_mask); + tmp2 = _mm_and_si128(tmp2, and_mask); + tmp1 = _mm_shuffle_epi8(lower_mask, tmp1); + tmp2 = _mm_shuffle_epi8(higher_mask, tmp2); + _mm_xor_si128(tmp1, tmp2) +} + +#[inline] +unsafe fn mul_matrix_ata(x: __m128i) -> __m128i { + let higher_mask = _mm_set_epi8( + 0x14u8 as i8, + 0x07u8 as i8, + 0xc6u8 as i8, + 0xd5u8 as i8, + 0x6cu8 as i8, + 0x7fu8 as i8, + 0xbeu8 as i8, + 0xadu8 as i8, + 0xb9u8 as i8, + 0xaau8 as i8, + 0x6bu8 as i8, + 0x78u8 as i8, + 0xc1u8 as i8, + 0xd2u8 as i8, + 0x13u8 as i8, + 0x00u8 as i8, + ); + let lower_mask = _mm_set_epi8( + 0xd8u8 as i8, + 0xb8u8 as i8, + 0xfau8 as i8, + 0x9au8 as i8, + 0xc5u8 as i8, + 0xa5u8 as i8, + 0xe7u8 as i8, + 0x87u8 as i8, + 0x5fu8 as i8, + 0x3fu8 as i8, + 0x7du8 as i8, + 0x1du8 as i8, + 0x42u8 as i8, + 0x22u8 as i8, + 0x60u8 as i8, + 0x00u8 as i8, + ); + mul_matrix(x, higher_mask, lower_mask) +} + +#[inline] +unsafe fn mul_matrix_ta(x: __m128i) -> __m128i { + let higher_mask = _mm_set_epi8( + 0x22u8 as i8, + 0x58u8 as i8, + 0x1au8 as i8, + 0x60u8 as i8, + 0x02u8 as i8, + 0x78u8 as i8, + 0x3au8 as i8, + 0x40u8 as i8, + 0x62u8 as i8, + 0x18u8 as i8, + 0x5au8 as i8, + 0x20u8 as i8, + 0x42u8 as i8, + 0x38u8 as i8, + 0x7au8 as i8, + 0x00u8 as i8, + ); + let lower_mask = _mm_set_epi8( + 0xe2u8 as i8, + 0x28u8 as i8, + 0x95u8 as i8, + 0x5fu8 as i8, + 0x69u8 as i8, + 0xa3u8 as i8, + 0x1eu8 as i8, + 0xd4u8 as i8, + 0x36u8 as i8, + 0xfcu8 as i8, + 0x41u8 as i8, + 0x8bu8 as i8, + 0xbdu8 as i8, + 0x77u8 as i8, + 0xcau8 as i8, + 0x00u8 as i8, + ); + mul_matrix(x, higher_mask, lower_mask) +} + +#[inline] +unsafe fn add_tc(x: __m128i) -> __m128i { + let tc = _mm_set1_epi8(0b00100011); + _mm_xor_si128(x, tc) +} + +#[inline] +unsafe fn add_atac(x: __m128i) -> __m128i { + let atac = _mm_set1_epi8(0b00111011); + _mm_xor_si128(x, atac) +} + +#[inline] +unsafe fn sm4_sbox(mut x: __m128i) -> __m128i { + let mask: __m128i = _mm_set_epi8( + 0x03, 0x06, 0x09, 0x0c, 0x0f, 0x02, 0x05, 0x08, 0x0b, 0x0e, 0x01, 0x04, 0x07, 0x0a, 0x0d, + 0x00, + ); + x = _mm_shuffle_epi8(x, mask); // 逆行移位 + x = add_tc(mul_matrix_ta(x)); + x = _mm_aesenclast_si128(x, _mm_setzero_si128()); + add_atac(mul_matrix_ata(x)) +} + +type ParBlocks4 = GenericArray, U4>; + +#[inline] +#[target_feature(enable = "aes")] +pub(super) unsafe fn sm4_process4( + blocks: InOut<'_, '_, ParBlocks4>, + rk: &[u32; 32], + encrypt: bool, +) { + let (in_ptr, out_ptr) = blocks.into_raw(); + + let in_block_ptr: *const __m128i = in_ptr as *const _; + let mut b: [__m128i; 4] = [ + _mm_loadu_si128(in_block_ptr.add(0)), + _mm_loadu_si128(in_block_ptr.add(1)), + _mm_loadu_si128(in_block_ptr.add(2)), + _mm_loadu_si128(in_block_ptr.add(3)), + ]; + let vindex = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + + let mut x: [__m128i; 4] = [ + mm_pack0_epi32(b[0], b[1], b[2], b[3]), + mm_pack1_epi32(b[0], b[1], b[2], b[3]), + mm_pack2_epi32(b[0], b[1], b[2], b[3]), + mm_pack3_epi32(b[0], b[1], b[2], b[3]), + ]; + + // Shuffle Endian + x[0] = _mm_shuffle_epi8(x[0], vindex); + x[1] = _mm_shuffle_epi8(x[1], vindex); + x[2] = _mm_shuffle_epi8(x[2], vindex); + x[3] = _mm_shuffle_epi8(x[3], vindex); + + for i in 0..32 { + let k = if encrypt { + 
_mm_set1_epi32(rk[i] as i32) + } else { + _mm_set1_epi32(rk[31 - i] as i32) + }; + b[0] = mm_xor4(x[1], x[2], x[3], k); + b[0] = sm4_sbox(b[0]); + b[0] = mm_xor6( + x[0], + b[0], + mm_rotl_epi32!(b[0], 2), + mm_rotl_epi32!(b[0], 10), + mm_rotl_epi32!(b[0], 18), + mm_rotl_epi32!(b[0], 24), + ); + + x[0] = x[1]; + x[1] = x[2]; + x[2] = x[3]; + x[3] = b[0]; + } + + x[0] = _mm_shuffle_epi8(x[0], vindex); + x[1] = _mm_shuffle_epi8(x[1], vindex); + x[2] = _mm_shuffle_epi8(x[2], vindex); + x[3] = _mm_shuffle_epi8(x[3], vindex); + + let out_block_ptr: *mut __m128i = out_ptr as *mut _; + _mm_storeu_si128(out_block_ptr.add(0), mm_pack0_epi32(x[3], x[2], x[1], x[0])); + _mm_storeu_si128(out_block_ptr.add(1), mm_pack1_epi32(x[3], x[2], x[1], x[0])); + _mm_storeu_si128(out_block_ptr.add(2), mm_pack2_epi32(x[3], x[2], x[1], x[0])); + _mm_storeu_si128(out_block_ptr.add(3), mm_pack3_epi32(x[3], x[2], x[1], x[0])); +} + +#[inline] +pub fn sm4_encrypt4(blocks: InOut<'_, '_, ParBlocks4>, rk: &[u32; 32]) { + unsafe { sm4_process4::(blocks, rk, true) } +} + +#[inline] +pub fn sm4_decrypt4(blocks: InOut<'_, '_, ParBlocks4>, rk: &[u32; 32]) { + unsafe { sm4_process4::(blocks, rk, false) } +} + +/// SM4 block cipher. +#[derive(Clone)] +pub struct Sm4 { + rk: [u32; 32], +} + +impl BlockCipher for Sm4 {} + +impl KeySizeUser for Sm4 { + type KeySize = U16; +} + +impl KeyInit for Sm4 { + fn new(key: &Key) -> Self { + Sm4 { + rk: crate::soft::sm4_init_key::(key), + } + } +} + +impl fmt::Debug for Sm4 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("Sm4 { ... }") + } +} + +impl AlgorithmName for Sm4 { + fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("Sm4") + } +} + +#[cfg(feature = "zeroize")] +#[cfg_attr(docsrs, doc(cfg(feature = "zeroize")))] +impl Drop for Sm4 { + fn drop(&mut self) { + self.rk.zeroize(); + } +} + +#[cfg(feature = "zeroize")] +#[cfg_attr(docsrs, doc(cfg(feature = "zeroize")))] +impl ZeroizeOnDrop for Sm4 {} + +impl BlockSizeUser for Sm4 { + type BlockSize = U16; +} + +impl BlockEncrypt for Sm4 { + fn encrypt_with_backend(&self, f: impl cipher::BlockClosure) { + f.call(&mut Sm4Enc(self)) + } +} + +pub struct Sm4Enc<'a>(&'a Sm4); + +impl<'a> BlockSizeUser for Sm4Enc<'a> { + type BlockSize = U16; +} + +impl<'a> ParBlocksSizeUser for Sm4Enc<'a> { + type ParBlocksSize = U4; +} + +impl<'a> BlockBackend for Sm4Enc<'a> { + #[inline(always)] + fn proc_block(&mut self, block: InOut<'_, '_, Block>) { + crate::soft::sm4_encrypt::(block, &self.0.rk); + } + + #[inline(always)] + fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, ParBlocks>) { + sm4_encrypt4::(blocks, &self.0.rk); + } +} + +impl BlockDecrypt for Sm4 { + fn decrypt_with_backend(&self, f: impl cipher::BlockClosure) { + f.call(&mut Sm4Dec(self)) + } +} + +pub struct Sm4Dec<'a>(&'a Sm4); + +impl<'a> BlockSizeUser for Sm4Dec<'a> { + type BlockSize = U16; +} + +impl<'a> ParBlocksSizeUser for Sm4Dec<'a> { + type ParBlocksSize = U4; +} + +impl<'a> BlockBackend for Sm4Dec<'a> { + #[inline(always)] + fn proc_block(&mut self, block: InOut<'_, '_, Block>) { + crate::soft::sm4_decrypt::(block, &self.0.rk); + } + + #[inline(always)] + fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, ParBlocks>) { + sm4_decrypt4::(blocks, &self.0.rk); + } +} diff --git a/sm4/src/x86/autodetect.rs b/sm4/src/x86/autodetect.rs new file mode 100644 index 00000000..0e81a69e --- /dev/null +++ b/sm4/src/x86/autodetect.rs @@ -0,0 +1,139 @@ +//! 
SM4 X86 autodetection
+
+#![allow(unsafe_code)]
+
+use cipher::{
+    consts::U16, AlgorithmName, BlockCipher, BlockClosure, BlockDecrypt, BlockEncrypt,
+    BlockSizeUser, Key, KeyInit, KeySizeUser,
+};
+use core::{fmt, mem::ManuallyDrop};
+
+cpufeatures::new!(aes_intrinsics, "aes");
+cpufeatures::new!(avx2_intrinsics, "avx2");
+
+union Sm4Cipher {
+    avx2: ManuallyDrop<super::avx2::Sm4>,
+    aesni: ManuallyDrop<super::aesni::Sm4>,
+    soft: ManuallyDrop<crate::soft::Sm4>,
+}
+
+/// SM4 block cipher.
+pub struct Sm4 {
+    cipher: Sm4Cipher,
+    aes_token: aes_intrinsics::InitToken,
+    avx2_token: avx2_intrinsics::InitToken,
+}
+
+impl KeySizeUser for Sm4 {
+    type KeySize = U16;
+}
+
+impl KeyInit for Sm4 {
+    #[inline]
+    fn new(key: &Key<Self>) -> Self {
+        let (aes_token, aes_presence) = aes_intrinsics::init_get();
+        let (avx2_token, avx2_presence) = avx2_intrinsics::init_get();
+
+        let cipher = if aes_presence {
+            Sm4Cipher {
+                aesni: ManuallyDrop::new(super::aesni::Sm4::new(key)),
+            }
+        } else if avx2_presence {
+            Sm4Cipher {
+                avx2: ManuallyDrop::new(super::avx2::Sm4::new(key)),
+            }
+        } else {
+            Sm4Cipher {
+                soft: ManuallyDrop::new(crate::soft::Sm4::new(key)),
+            }
+        };
+
+        Self {
+            cipher,
+            aes_token,
+            avx2_token,
+        }
+    }
+}
+
+impl Clone for Sm4 {
+    fn clone(&self) -> Self {
+        let cipher = if self.aes_token.get() {
+            Sm4Cipher {
+                aesni: unsafe { self.cipher.aesni.clone() },
+            }
+        } else if self.avx2_token.get() {
+            Sm4Cipher {
+                avx2: unsafe { self.cipher.avx2.clone() },
+            }
+        } else {
+            Sm4Cipher {
+                soft: unsafe { self.cipher.soft.clone() },
+            }
+        };
+
+        Self {
+            cipher,
+            aes_token: self.aes_token,
+            avx2_token: self.avx2_token,
+        }
+    }
+}
+
+impl BlockSizeUser for Sm4 {
+    type BlockSize = U16;
+}
+
+impl BlockCipher for Sm4 {}
+
+impl BlockEncrypt for Sm4 {
+    fn encrypt_with_backend(&self, f: impl BlockClosure<BlockSize = Self::BlockSize>) {
+        unsafe {
+            if self.aes_token.get() {
+                self.cipher.aesni.encrypt_with_backend(f);
+            } else if self.avx2_token.get() {
+                self.cipher.avx2.encrypt_with_backend(f);
+            } else {
+                self.cipher.soft.encrypt_with_backend(f);
+            }
+        }
+    }
+}
+
+impl BlockDecrypt for Sm4 {
+    fn decrypt_with_backend(&self, f: impl BlockClosure<BlockSize = Self::BlockSize>) {
+        unsafe {
+            if self.aes_token.get() {
+                self.cipher.aesni.decrypt_with_backend(f);
+            } else if self.avx2_token.get() {
+                self.cipher.avx2.decrypt_with_backend(f);
+            } else {
+                self.cipher.soft.decrypt_with_backend(f);
+            }
+        }
+    }
+}
+
+impl fmt::Debug for Sm4 {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
+        f.write_str(concat!(stringify!(Sm4), " { .. }"))
+    }
+}
+
+impl AlgorithmName for Sm4 {
+    fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(stringify!(Sm4))
+    }
+}
+
+impl Drop for Sm4 {
+    fn drop(&mut self) {
+        if self.aes_token.get() {
+            unsafe { ManuallyDrop::drop(&mut self.cipher.aesni) }
+        } else if self.avx2_token.get() {
+            unsafe { ManuallyDrop::drop(&mut self.cipher.avx2) }
+        } else {
+            unsafe { ManuallyDrop::drop(&mut self.cipher.soft) }
+        }
+    }
+}
diff --git a/sm4/src/x86/avx2.rs b/sm4/src/x86/avx2.rs
new file mode 100644
index 00000000..2a9dcde4
--- /dev/null
+++ b/sm4/src/x86/avx2.rs
@@ -0,0 +1,388 @@
+//! SM4 with AVX2
+//!
+//! Implementation was borrowed from kentle.
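+//!
+//! Each of the 32 rounds performs four `_mm256_i32gather_epi32` table lookups
+//! (BOX0..BOX3, one table per byte of the 32-bit round input) and XORs the
+//! results together, processing eight blocks (`ParBlocks8`) per call.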
+ +#![allow(unsafe_code)] + +#[cfg(feature = "zeroize")] +use cipher::zeroize::{Zeroize, ZeroizeOnDrop}; +use cipher::{ + consts::{U16, U8}, + generic_array::GenericArray, + inout::InOut, + AlgorithmName, Block, BlockCipher, BlockDecrypt, BlockSizeUser, Key, KeyInit, KeySizeUser, + ParBlocks, ParBlocksSizeUser, +}; +use cipher::{BlockBackend, BlockEncrypt}; +use core::{arch::x86_64::*, fmt}; + +static BOX0: [u32; 256] = [ + 0xD55B5B8E, 0x924242D0, 0xEAA7A74D, 0xFDFBFB06, 0xCF3333FC, 0xE2878765, 0x3DF4F4C9, 0xB5DEDE6B, + 0x1658584E, 0xB4DADA6E, 0x14505044, 0xC10B0BCA, 0x28A0A088, 0xF8EFEF17, 0x2CB0B09C, 0x05141411, + 0x2BACAC87, 0x669D9DFB, 0x986A6AF2, 0x77D9D9AE, 0x2AA8A882, 0xBCFAFA46, 0x04101014, 0xC00F0FCF, + 0xA8AAAA02, 0x45111154, 0x134C4C5F, 0x269898BE, 0x4825256D, 0x841A1A9E, 0x0618181E, 0x9B6666FD, + 0x9E7272EC, 0x4309094A, 0x51414110, 0xF7D3D324, 0x934646D5, 0xECBFBF53, 0x9A6262F8, 0x7BE9E992, + 0x33CCCCFF, 0x55515104, 0x0B2C2C27, 0x420D0D4F, 0xEEB7B759, 0xCC3F3FF3, 0xAEB2B21C, 0x638989EA, + 0xE7939374, 0xB1CECE7F, 0x1C70706C, 0xABA6A60D, 0xCA2727ED, 0x08202028, 0xEBA3A348, 0x975656C1, + 0x82020280, 0xDC7F7FA3, 0x965252C4, 0xF9EBEB12, 0x74D5D5A1, 0x8D3E3EB3, 0x3FFCFCC3, 0xA49A9A3E, + 0x461D1D5B, 0x071C1C1B, 0xA59E9E3B, 0xFFF3F30C, 0xF0CFCF3F, 0x72CDCDBF, 0x175C5C4B, 0xB8EAEA52, + 0x810E0E8F, 0x5865653D, 0x3CF0F0CC, 0x1964647D, 0xE59B9B7E, 0x87161691, 0x4E3D3D73, 0xAAA2A208, + 0x69A1A1C8, 0x6AADADC7, 0x83060685, 0xB0CACA7A, 0x70C5C5B5, 0x659191F4, 0xD96B6BB2, 0x892E2EA7, + 0xFBE3E318, 0xE8AFAF47, 0x0F3C3C33, 0x4A2D2D67, 0x71C1C1B0, 0x5759590E, 0x9F7676E9, 0x35D4D4E1, + 0x1E787866, 0x249090B4, 0x0E383836, 0x5F797926, 0x628D8DEF, 0x59616138, 0xD2474795, 0xA08A8A2A, + 0x259494B1, 0x228888AA, 0x7DF1F18C, 0x3BECECD7, 0x01040405, 0x218484A5, 0x79E1E198, 0x851E1E9B, + 0xD7535384, 0x00000000, 0x4719195E, 0x565D5D0B, 0x9D7E7EE3, 0xD04F4F9F, 0x279C9CBB, 0x5349491A, + 0x4D31317C, 0x36D8D8EE, 0x0208080A, 0xE49F9F7B, 0xA2828220, 0xC71313D4, 0xCB2323E8, 0x9C7A7AE6, + 0xE9ABAB42, 0xBDFEFE43, 0x882A2AA2, 0xD14B4B9A, 0x41010140, 0xC41F1FDB, 0x38E0E0D8, 0xB7D6D661, + 0xA18E8E2F, 0xF4DFDF2B, 0xF1CBCB3A, 0xCD3B3BF6, 0xFAE7E71D, 0x608585E5, 0x15545441, 0xA3868625, + 0xE3838360, 0xACBABA16, 0x5C757529, 0xA6929234, 0x996E6EF7, 0x34D0D0E4, 0x1A686872, 0x54555501, + 0xAFB6B619, 0x914E4EDF, 0x32C8C8FA, 0x30C0C0F0, 0xF6D7D721, 0x8E3232BC, 0xB3C6C675, 0xE08F8F6F, + 0x1D747469, 0xF5DBDB2E, 0xE18B8B6A, 0x2EB8B896, 0x800A0A8A, 0x679999FE, 0xC92B2BE2, 0x618181E0, + 0xC30303C0, 0x29A4A48D, 0x238C8CAF, 0xA9AEAE07, 0x0D343439, 0x524D4D1F, 0x4F393976, 0x6EBDBDD3, + 0xD6575781, 0xD86F6FB7, 0x37DCDCEB, 0x44151551, 0xDD7B7BA6, 0xFEF7F709, 0x8C3A3AB6, 0x2FBCBC93, + 0x030C0C0F, 0xFCFFFF03, 0x6BA9A9C2, 0x73C9C9BA, 0x6CB5B5D9, 0x6DB1B1DC, 0x5A6D6D37, 0x50454515, + 0x8F3636B9, 0x1B6C6C77, 0xADBEBE13, 0x904A4ADA, 0xB9EEEE57, 0xDE7777A9, 0xBEF2F24C, 0x7EFDFD83, + 0x11444455, 0xDA6767BD, 0x5D71712C, 0x40050545, 0x1F7C7C63, 0x10404050, 0x5B696932, 0xDB6363B8, + 0x0A282822, 0xC20707C5, 0x31C4C4F5, 0x8A2222A8, 0xA7969631, 0xCE3737F9, 0x7AEDED97, 0xBFF6F649, + 0x2DB4B499, 0x75D1D1A4, 0xD3434390, 0x1248485A, 0xBAE2E258, 0xE6979771, 0xB6D2D264, 0xB2C2C270, + 0x8B2626AD, 0x68A5A5CD, 0x955E5ECB, 0x4B292962, 0x0C30303C, 0x945A5ACE, 0x76DDDDAB, 0x7FF9F986, + 0x649595F1, 0xBBE6E65D, 0xF2C7C735, 0x0924242D, 0xC61717D1, 0x6FB9B9D6, 0xC51B1BDE, 0x86121294, + 0x18606078, 0xF3C3C330, 0x7CF5F589, 0xEFB3B35C, 0x3AE8E8D2, 0xDF7373AC, 0x4C353579, 0x208080A0, + 0x78E5E59D, 0xEDBBBB56, 0x5E7D7D23, 0x3EF8F8C6, 0xD45F5F8B, 0xC82F2FE7, 0x39E4E4DD, 
0x49212168, +]; + +static BOX1: [u32; 256] = [ + 0x5B5B8ED5, 0x4242D092, 0xA7A74DEA, 0xFBFB06FD, 0x3333FCCF, 0x878765E2, 0xF4F4C93D, 0xDEDE6BB5, + 0x58584E16, 0xDADA6EB4, 0x50504414, 0x0B0BCAC1, 0xA0A08828, 0xEFEF17F8, 0xB0B09C2C, 0x14141105, + 0xACAC872B, 0x9D9DFB66, 0x6A6AF298, 0xD9D9AE77, 0xA8A8822A, 0xFAFA46BC, 0x10101404, 0x0F0FCFC0, + 0xAAAA02A8, 0x11115445, 0x4C4C5F13, 0x9898BE26, 0x25256D48, 0x1A1A9E84, 0x18181E06, 0x6666FD9B, + 0x7272EC9E, 0x09094A43, 0x41411051, 0xD3D324F7, 0x4646D593, 0xBFBF53EC, 0x6262F89A, 0xE9E9927B, + 0xCCCCFF33, 0x51510455, 0x2C2C270B, 0x0D0D4F42, 0xB7B759EE, 0x3F3FF3CC, 0xB2B21CAE, 0x8989EA63, + 0x939374E7, 0xCECE7FB1, 0x70706C1C, 0xA6A60DAB, 0x2727EDCA, 0x20202808, 0xA3A348EB, 0x5656C197, + 0x02028082, 0x7F7FA3DC, 0x5252C496, 0xEBEB12F9, 0xD5D5A174, 0x3E3EB38D, 0xFCFCC33F, 0x9A9A3EA4, + 0x1D1D5B46, 0x1C1C1B07, 0x9E9E3BA5, 0xF3F30CFF, 0xCFCF3FF0, 0xCDCDBF72, 0x5C5C4B17, 0xEAEA52B8, + 0x0E0E8F81, 0x65653D58, 0xF0F0CC3C, 0x64647D19, 0x9B9B7EE5, 0x16169187, 0x3D3D734E, 0xA2A208AA, + 0xA1A1C869, 0xADADC76A, 0x06068583, 0xCACA7AB0, 0xC5C5B570, 0x9191F465, 0x6B6BB2D9, 0x2E2EA789, + 0xE3E318FB, 0xAFAF47E8, 0x3C3C330F, 0x2D2D674A, 0xC1C1B071, 0x59590E57, 0x7676E99F, 0xD4D4E135, + 0x7878661E, 0x9090B424, 0x3838360E, 0x7979265F, 0x8D8DEF62, 0x61613859, 0x474795D2, 0x8A8A2AA0, + 0x9494B125, 0x8888AA22, 0xF1F18C7D, 0xECECD73B, 0x04040501, 0x8484A521, 0xE1E19879, 0x1E1E9B85, + 0x535384D7, 0x00000000, 0x19195E47, 0x5D5D0B56, 0x7E7EE39D, 0x4F4F9FD0, 0x9C9CBB27, 0x49491A53, + 0x31317C4D, 0xD8D8EE36, 0x08080A02, 0x9F9F7BE4, 0x828220A2, 0x1313D4C7, 0x2323E8CB, 0x7A7AE69C, + 0xABAB42E9, 0xFEFE43BD, 0x2A2AA288, 0x4B4B9AD1, 0x01014041, 0x1F1FDBC4, 0xE0E0D838, 0xD6D661B7, + 0x8E8E2FA1, 0xDFDF2BF4, 0xCBCB3AF1, 0x3B3BF6CD, 0xE7E71DFA, 0x8585E560, 0x54544115, 0x868625A3, + 0x838360E3, 0xBABA16AC, 0x7575295C, 0x929234A6, 0x6E6EF799, 0xD0D0E434, 0x6868721A, 0x55550154, + 0xB6B619AF, 0x4E4EDF91, 0xC8C8FA32, 0xC0C0F030, 0xD7D721F6, 0x3232BC8E, 0xC6C675B3, 0x8F8F6FE0, + 0x7474691D, 0xDBDB2EF5, 0x8B8B6AE1, 0xB8B8962E, 0x0A0A8A80, 0x9999FE67, 0x2B2BE2C9, 0x8181E061, + 0x0303C0C3, 0xA4A48D29, 0x8C8CAF23, 0xAEAE07A9, 0x3434390D, 0x4D4D1F52, 0x3939764F, 0xBDBDD36E, + 0x575781D6, 0x6F6FB7D8, 0xDCDCEB37, 0x15155144, 0x7B7BA6DD, 0xF7F709FE, 0x3A3AB68C, 0xBCBC932F, + 0x0C0C0F03, 0xFFFF03FC, 0xA9A9C26B, 0xC9C9BA73, 0xB5B5D96C, 0xB1B1DC6D, 0x6D6D375A, 0x45451550, + 0x3636B98F, 0x6C6C771B, 0xBEBE13AD, 0x4A4ADA90, 0xEEEE57B9, 0x7777A9DE, 0xF2F24CBE, 0xFDFD837E, + 0x44445511, 0x6767BDDA, 0x71712C5D, 0x05054540, 0x7C7C631F, 0x40405010, 0x6969325B, 0x6363B8DB, + 0x2828220A, 0x0707C5C2, 0xC4C4F531, 0x2222A88A, 0x969631A7, 0x3737F9CE, 0xEDED977A, 0xF6F649BF, + 0xB4B4992D, 0xD1D1A475, 0x434390D3, 0x48485A12, 0xE2E258BA, 0x979771E6, 0xD2D264B6, 0xC2C270B2, + 0x2626AD8B, 0xA5A5CD68, 0x5E5ECB95, 0x2929624B, 0x30303C0C, 0x5A5ACE94, 0xDDDDAB76, 0xF9F9867F, + 0x9595F164, 0xE6E65DBB, 0xC7C735F2, 0x24242D09, 0x1717D1C6, 0xB9B9D66F, 0x1B1BDEC5, 0x12129486, + 0x60607818, 0xC3C330F3, 0xF5F5897C, 0xB3B35CEF, 0xE8E8D23A, 0x7373ACDF, 0x3535794C, 0x8080A020, + 0xE5E59D78, 0xBBBB56ED, 0x7D7D235E, 0xF8F8C63E, 0x5F5F8BD4, 0x2F2FE7C8, 0xE4E4DD39, 0x21216849, +]; + +static BOX2: [u32; 256] = [ + 0x5B8ED55B, 0x42D09242, 0xA74DEAA7, 0xFB06FDFB, 0x33FCCF33, 0x8765E287, 0xF4C93DF4, 0xDE6BB5DE, + 0x584E1658, 0xDA6EB4DA, 0x50441450, 0x0BCAC10B, 0xA08828A0, 0xEF17F8EF, 0xB09C2CB0, 0x14110514, + 0xAC872BAC, 0x9DFB669D, 0x6AF2986A, 0xD9AE77D9, 0xA8822AA8, 0xFA46BCFA, 0x10140410, 0x0FCFC00F, + 0xAA02A8AA, 0x11544511, 0x4C5F134C, 
0x98BE2698, 0x256D4825, 0x1A9E841A, 0x181E0618, 0x66FD9B66, + 0x72EC9E72, 0x094A4309, 0x41105141, 0xD324F7D3, 0x46D59346, 0xBF53ECBF, 0x62F89A62, 0xE9927BE9, + 0xCCFF33CC, 0x51045551, 0x2C270B2C, 0x0D4F420D, 0xB759EEB7, 0x3FF3CC3F, 0xB21CAEB2, 0x89EA6389, + 0x9374E793, 0xCE7FB1CE, 0x706C1C70, 0xA60DABA6, 0x27EDCA27, 0x20280820, 0xA348EBA3, 0x56C19756, + 0x02808202, 0x7FA3DC7F, 0x52C49652, 0xEB12F9EB, 0xD5A174D5, 0x3EB38D3E, 0xFCC33FFC, 0x9A3EA49A, + 0x1D5B461D, 0x1C1B071C, 0x9E3BA59E, 0xF30CFFF3, 0xCF3FF0CF, 0xCDBF72CD, 0x5C4B175C, 0xEA52B8EA, + 0x0E8F810E, 0x653D5865, 0xF0CC3CF0, 0x647D1964, 0x9B7EE59B, 0x16918716, 0x3D734E3D, 0xA208AAA2, + 0xA1C869A1, 0xADC76AAD, 0x06858306, 0xCA7AB0CA, 0xC5B570C5, 0x91F46591, 0x6BB2D96B, 0x2EA7892E, + 0xE318FBE3, 0xAF47E8AF, 0x3C330F3C, 0x2D674A2D, 0xC1B071C1, 0x590E5759, 0x76E99F76, 0xD4E135D4, + 0x78661E78, 0x90B42490, 0x38360E38, 0x79265F79, 0x8DEF628D, 0x61385961, 0x4795D247, 0x8A2AA08A, + 0x94B12594, 0x88AA2288, 0xF18C7DF1, 0xECD73BEC, 0x04050104, 0x84A52184, 0xE19879E1, 0x1E9B851E, + 0x5384D753, 0x00000000, 0x195E4719, 0x5D0B565D, 0x7EE39D7E, 0x4F9FD04F, 0x9CBB279C, 0x491A5349, + 0x317C4D31, 0xD8EE36D8, 0x080A0208, 0x9F7BE49F, 0x8220A282, 0x13D4C713, 0x23E8CB23, 0x7AE69C7A, + 0xAB42E9AB, 0xFE43BDFE, 0x2AA2882A, 0x4B9AD14B, 0x01404101, 0x1FDBC41F, 0xE0D838E0, 0xD661B7D6, + 0x8E2FA18E, 0xDF2BF4DF, 0xCB3AF1CB, 0x3BF6CD3B, 0xE71DFAE7, 0x85E56085, 0x54411554, 0x8625A386, + 0x8360E383, 0xBA16ACBA, 0x75295C75, 0x9234A692, 0x6EF7996E, 0xD0E434D0, 0x68721A68, 0x55015455, + 0xB619AFB6, 0x4EDF914E, 0xC8FA32C8, 0xC0F030C0, 0xD721F6D7, 0x32BC8E32, 0xC675B3C6, 0x8F6FE08F, + 0x74691D74, 0xDB2EF5DB, 0x8B6AE18B, 0xB8962EB8, 0x0A8A800A, 0x99FE6799, 0x2BE2C92B, 0x81E06181, + 0x03C0C303, 0xA48D29A4, 0x8CAF238C, 0xAE07A9AE, 0x34390D34, 0x4D1F524D, 0x39764F39, 0xBDD36EBD, + 0x5781D657, 0x6FB7D86F, 0xDCEB37DC, 0x15514415, 0x7BA6DD7B, 0xF709FEF7, 0x3AB68C3A, 0xBC932FBC, + 0x0C0F030C, 0xFF03FCFF, 0xA9C26BA9, 0xC9BA73C9, 0xB5D96CB5, 0xB1DC6DB1, 0x6D375A6D, 0x45155045, + 0x36B98F36, 0x6C771B6C, 0xBE13ADBE, 0x4ADA904A, 0xEE57B9EE, 0x77A9DE77, 0xF24CBEF2, 0xFD837EFD, + 0x44551144, 0x67BDDA67, 0x712C5D71, 0x05454005, 0x7C631F7C, 0x40501040, 0x69325B69, 0x63B8DB63, + 0x28220A28, 0x07C5C207, 0xC4F531C4, 0x22A88A22, 0x9631A796, 0x37F9CE37, 0xED977AED, 0xF649BFF6, + 0xB4992DB4, 0xD1A475D1, 0x4390D343, 0x485A1248, 0xE258BAE2, 0x9771E697, 0xD264B6D2, 0xC270B2C2, + 0x26AD8B26, 0xA5CD68A5, 0x5ECB955E, 0x29624B29, 0x303C0C30, 0x5ACE945A, 0xDDAB76DD, 0xF9867FF9, + 0x95F16495, 0xE65DBBE6, 0xC735F2C7, 0x242D0924, 0x17D1C617, 0xB9D66FB9, 0x1BDEC51B, 0x12948612, + 0x60781860, 0xC330F3C3, 0xF5897CF5, 0xB35CEFB3, 0xE8D23AE8, 0x73ACDF73, 0x35794C35, 0x80A02080, + 0xE59D78E5, 0xBB56EDBB, 0x7D235E7D, 0xF8C63EF8, 0x5F8BD45F, 0x2FE7C82F, 0xE4DD39E4, 0x21684921, +]; + +static BOX3: [u32; 256] = [ + 0x8ED55B5B, 0xD0924242, 0x4DEAA7A7, 0x06FDFBFB, 0xFCCF3333, 0x65E28787, 0xC93DF4F4, 0x6BB5DEDE, + 0x4E165858, 0x6EB4DADA, 0x44145050, 0xCAC10B0B, 0x8828A0A0, 0x17F8EFEF, 0x9C2CB0B0, 0x11051414, + 0x872BACAC, 0xFB669D9D, 0xF2986A6A, 0xAE77D9D9, 0x822AA8A8, 0x46BCFAFA, 0x14041010, 0xCFC00F0F, + 0x02A8AAAA, 0x54451111, 0x5F134C4C, 0xBE269898, 0x6D482525, 0x9E841A1A, 0x1E061818, 0xFD9B6666, + 0xEC9E7272, 0x4A430909, 0x10514141, 0x24F7D3D3, 0xD5934646, 0x53ECBFBF, 0xF89A6262, 0x927BE9E9, + 0xFF33CCCC, 0x04555151, 0x270B2C2C, 0x4F420D0D, 0x59EEB7B7, 0xF3CC3F3F, 0x1CAEB2B2, 0xEA638989, + 0x74E79393, 0x7FB1CECE, 0x6C1C7070, 0x0DABA6A6, 0xEDCA2727, 0x28082020, 0x48EBA3A3, 0xC1975656, + 0x80820202, 0xA3DC7F7F, 
0xC4965252, 0x12F9EBEB, 0xA174D5D5, 0xB38D3E3E, 0xC33FFCFC, 0x3EA49A9A, + 0x5B461D1D, 0x1B071C1C, 0x3BA59E9E, 0x0CFFF3F3, 0x3FF0CFCF, 0xBF72CDCD, 0x4B175C5C, 0x52B8EAEA, + 0x8F810E0E, 0x3D586565, 0xCC3CF0F0, 0x7D196464, 0x7EE59B9B, 0x91871616, 0x734E3D3D, 0x08AAA2A2, + 0xC869A1A1, 0xC76AADAD, 0x85830606, 0x7AB0CACA, 0xB570C5C5, 0xF4659191, 0xB2D96B6B, 0xA7892E2E, + 0x18FBE3E3, 0x47E8AFAF, 0x330F3C3C, 0x674A2D2D, 0xB071C1C1, 0x0E575959, 0xE99F7676, 0xE135D4D4, + 0x661E7878, 0xB4249090, 0x360E3838, 0x265F7979, 0xEF628D8D, 0x38596161, 0x95D24747, 0x2AA08A8A, + 0xB1259494, 0xAA228888, 0x8C7DF1F1, 0xD73BECEC, 0x05010404, 0xA5218484, 0x9879E1E1, 0x9B851E1E, + 0x84D75353, 0x00000000, 0x5E471919, 0x0B565D5D, 0xE39D7E7E, 0x9FD04F4F, 0xBB279C9C, 0x1A534949, + 0x7C4D3131, 0xEE36D8D8, 0x0A020808, 0x7BE49F9F, 0x20A28282, 0xD4C71313, 0xE8CB2323, 0xE69C7A7A, + 0x42E9ABAB, 0x43BDFEFE, 0xA2882A2A, 0x9AD14B4B, 0x40410101, 0xDBC41F1F, 0xD838E0E0, 0x61B7D6D6, + 0x2FA18E8E, 0x2BF4DFDF, 0x3AF1CBCB, 0xF6CD3B3B, 0x1DFAE7E7, 0xE5608585, 0x41155454, 0x25A38686, + 0x60E38383, 0x16ACBABA, 0x295C7575, 0x34A69292, 0xF7996E6E, 0xE434D0D0, 0x721A6868, 0x01545555, + 0x19AFB6B6, 0xDF914E4E, 0xFA32C8C8, 0xF030C0C0, 0x21F6D7D7, 0xBC8E3232, 0x75B3C6C6, 0x6FE08F8F, + 0x691D7474, 0x2EF5DBDB, 0x6AE18B8B, 0x962EB8B8, 0x8A800A0A, 0xFE679999, 0xE2C92B2B, 0xE0618181, + 0xC0C30303, 0x8D29A4A4, 0xAF238C8C, 0x07A9AEAE, 0x390D3434, 0x1F524D4D, 0x764F3939, 0xD36EBDBD, + 0x81D65757, 0xB7D86F6F, 0xEB37DCDC, 0x51441515, 0xA6DD7B7B, 0x09FEF7F7, 0xB68C3A3A, 0x932FBCBC, + 0x0F030C0C, 0x03FCFFFF, 0xC26BA9A9, 0xBA73C9C9, 0xD96CB5B5, 0xDC6DB1B1, 0x375A6D6D, 0x15504545, + 0xB98F3636, 0x771B6C6C, 0x13ADBEBE, 0xDA904A4A, 0x57B9EEEE, 0xA9DE7777, 0x4CBEF2F2, 0x837EFDFD, + 0x55114444, 0xBDDA6767, 0x2C5D7171, 0x45400505, 0x631F7C7C, 0x50104040, 0x325B6969, 0xB8DB6363, + 0x220A2828, 0xC5C20707, 0xF531C4C4, 0xA88A2222, 0x31A79696, 0xF9CE3737, 0x977AEDED, 0x49BFF6F6, + 0x992DB4B4, 0xA475D1D1, 0x90D34343, 0x5A124848, 0x58BAE2E2, 0x71E69797, 0x64B6D2D2, 0x70B2C2C2, + 0xAD8B2626, 0xCD68A5A5, 0xCB955E5E, 0x624B2929, 0x3C0C3030, 0xCE945A5A, 0xAB76DDDD, 0x867FF9F9, + 0xF1649595, 0x5DBBE6E6, 0x35F2C7C7, 0x2D092424, 0xD1C61717, 0xD66FB9B9, 0xDEC51B1B, 0x94861212, + 0x78186060, 0x30F3C3C3, 0x897CF5F5, 0x5CEFB3B3, 0xD23AE8E8, 0xACDF7373, 0x794C3535, 0xA0208080, + 0x9D78E5E5, 0x56EDBBBB, 0x235E7D7D, 0xC63EF8F8, 0x8BD45F5F, 0xE7C82F2F, 0xDD39E4E4, 0x68492121, +]; + +#[inline] +unsafe fn mm256_pack0_epi32(a: __m256i, b: __m256i, c: __m256i, d: __m256i) -> __m256i { + _mm256_unpacklo_epi64(_mm256_unpacklo_epi32(a, b), _mm256_unpacklo_epi32(c, d)) +} + +#[inline] +unsafe fn mm256_pack1_epi32(a: __m256i, b: __m256i, c: __m256i, d: __m256i) -> __m256i { + _mm256_unpackhi_epi64(_mm256_unpacklo_epi32(a, b), _mm256_unpacklo_epi32(c, d)) +} + +#[inline] +unsafe fn mm256_pack2_epi32(a: __m256i, b: __m256i, c: __m256i, d: __m256i) -> __m256i { + _mm256_unpacklo_epi64(_mm256_unpackhi_epi32(a, b), _mm256_unpackhi_epi32(c, d)) +} + +#[inline] +unsafe fn mm256_pack3_epi32(a: __m256i, b: __m256i, c: __m256i, d: __m256i) -> __m256i { + _mm256_unpackhi_epi64(_mm256_unpackhi_epi32(a, b), _mm256_unpackhi_epi32(c, d)) +} + +type ParBlocks8 = GenericArray, U8>; + +#[inline] +#[target_feature(enable = "avx2")] +pub(super) unsafe fn sm4_process8( + block: InOut<'_, '_, ParBlocks8>, + rk: &[u32; 32], + encrypt: bool, +) { + let mask = _mm256_set1_epi32(0xFF); + + let (in_ptr, out_ptr) = block.into_raw(); + let in_block2_ptr = in_ptr as *const __m256i; + + let mut b: [__m256i; 4] = [ + 
+        _mm256_loadu_si256(in_block2_ptr.add(0)),
+        _mm256_loadu_si256(in_block2_ptr.add(1)),
+        _mm256_loadu_si256(in_block2_ptr.add(2)),
+        _mm256_loadu_si256(in_block2_ptr.add(3)),
+    ];
+
+    // Transpose so that x[j] holds the j-th 32-bit word of each of the eight blocks.
+    let mut x: [__m256i; 4] = [
+        mm256_pack0_epi32(b[0], b[1], b[2], b[3]),
+        mm256_pack1_epi32(b[0], b[1], b[2], b[3]),
+        mm256_pack2_epi32(b[0], b[1], b[2], b[3]),
+        mm256_pack3_epi32(b[0], b[1], b[2], b[3]),
+    ];
+
+    // Reverse the bytes within each 32-bit lane (SM4 words are big-endian).
+    let vindex = _mm256_setr_epi8(
+        3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8,
+        15, 14, 13, 12,
+    );
+
+    x[0] = _mm256_shuffle_epi8(x[0], vindex);
+    x[1] = _mm256_shuffle_epi8(x[1], vindex);
+    x[2] = _mm256_shuffle_epi8(x[2], vindex);
+    x[3] = _mm256_shuffle_epi8(x[3], vindex);
+
+    // 32 rounds; decryption uses the round keys in reverse order.
+    for i in 0..32 {
+        let k = if encrypt {
+            _mm256_set1_epi32(rk[i] as i32)
+        } else {
+            _mm256_set1_epi32(rk[31 - i] as i32)
+        };
+
+        b[0] = _mm256_xor_si256(_mm256_xor_si256(x[1], x[2]), _mm256_xor_si256(x[3], k));
+
+        b[1] = _mm256_xor_si256(
+            x[0],
+            _mm256_i32gather_epi32::<4>(BOX0.as_ptr() as *const _, _mm256_and_si256(b[0], mask)),
+        );
+        b[0] = _mm256_srli_epi32(b[0], 8);
+        b[1] = _mm256_xor_si256(
+            b[1],
+            _mm256_i32gather_epi32::<4>(BOX1.as_ptr() as *const _, _mm256_and_si256(b[0], mask)),
+        );
+        b[0] = _mm256_srli_epi32(b[0], 8);
+        b[1] = _mm256_xor_si256(
+            b[1],
+            _mm256_i32gather_epi32::<4>(BOX2.as_ptr() as *const _, _mm256_and_si256(b[0], mask)),
+        );
+        b[0] = _mm256_srli_epi32(b[0], 8);
+        b[1] = _mm256_xor_si256(
+            b[1],
+            _mm256_i32gather_epi32::<4>(BOX3.as_ptr() as *const _, _mm256_and_si256(b[0], mask)),
+        );
+
+        x[0] = x[1];
+        x[1] = x[2];
+        x[2] = x[3];
+        x[3] = b[1];
+    }
+
+    x[0] = _mm256_shuffle_epi8(x[0], vindex);
+    x[1] = _mm256_shuffle_epi8(x[1], vindex);
+    x[2] = _mm256_shuffle_epi8(x[2], vindex);
+    x[3] = _mm256_shuffle_epi8(x[3], vindex);
+
+    // Transpose back to block layout; the reversed register order is SM4's final word reversal.
+    let out_block2_ptr = out_ptr as *mut __m256i;
+    _mm256_storeu_si256(
+        out_block2_ptr.add(0),
+        mm256_pack0_epi32(x[3], x[2], x[1], x[0]),
+    );
+    _mm256_storeu_si256(
+        out_block2_ptr.add(1),
+        mm256_pack1_epi32(x[3], x[2], x[1], x[0]),
+    );
+    _mm256_storeu_si256(
+        out_block2_ptr.add(2),
+        mm256_pack2_epi32(x[3], x[2], x[1], x[0]),
+    );
+    _mm256_storeu_si256(
+        out_block2_ptr.add(3),
+        mm256_pack3_epi32(x[3], x[2], x[1], x[0]),
+    );
+}
+
+#[inline]
+pub fn sm4_encrypt8(blocks: InOut<'_, '_, ParBlocks8>, rk: &[u32; 32]) {
+    unsafe { sm4_process8(blocks, rk, true) }
+}
+
+#[inline]
+pub fn sm4_decrypt8(blocks: InOut<'_, '_, ParBlocks8>, rk: &[u32; 32]) {
+    unsafe { sm4_process8(blocks, rk, false) }
+}
+
+/// SM4 block cipher.
+#[derive(Clone)]
+pub struct Sm4 {
+    rk: [u32; 32],
+}
+
+impl BlockCipher for Sm4 {}
+
+impl KeySizeUser for Sm4 {
+    type KeySize = U16;
+}
+
+impl KeyInit for Sm4 {
+    fn new(key: &Key<Self>) -> Self {
+        Sm4 {
+            rk: crate::soft::sm4_init_key(key),
+        }
+    }
+}
+
+impl fmt::Debug for Sm4 {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str("Sm4 { ... }")
}") + } +} + +impl AlgorithmName for Sm4 { + fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("Sm4") + } +} + +#[cfg(feature = "zeroize")] +#[cfg_attr(docsrs, doc(cfg(feature = "zeroize")))] +impl Drop for Sm4 { + fn drop(&mut self) { + self.rk.zeroize(); + } +} + +#[cfg(feature = "zeroize")] +#[cfg_attr(docsrs, doc(cfg(feature = "zeroize")))] +impl ZeroizeOnDrop for Sm4 {} + +impl BlockSizeUser for Sm4 { + type BlockSize = U16; +} + +impl BlockEncrypt for Sm4 { + fn encrypt_with_backend(&self, f: impl cipher::BlockClosure) { + f.call(&mut Sm4Enc(self)) + } +} + +pub struct Sm4Enc<'a>(&'a Sm4); + +impl<'a> BlockSizeUser for Sm4Enc<'a> { + type BlockSize = U16; +} + +impl<'a> ParBlocksSizeUser for Sm4Enc<'a> { + type ParBlocksSize = U8; +} + +impl<'a> BlockBackend for Sm4Enc<'a> { + #[inline(always)] + fn proc_block(&mut self, block: InOut<'_, '_, Block>) { + crate::soft::sm4_encrypt::(block, &self.0.rk); + } + + #[inline(always)] + fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, ParBlocks>) { + sm4_encrypt8::(blocks, &self.0.rk); + } +} + +impl BlockDecrypt for Sm4 { + fn decrypt_with_backend(&self, f: impl cipher::BlockClosure) { + f.call(&mut Sm4Dec(self)) + } +} + +pub struct Sm4Dec<'a>(&'a Sm4); + +impl<'a> BlockSizeUser for Sm4Dec<'a> { + type BlockSize = U16; +} + +impl<'a> ParBlocksSizeUser for Sm4Dec<'a> { + type ParBlocksSize = U8; +} + +impl<'a> BlockBackend for Sm4Dec<'a> { + #[inline(always)] + fn proc_block(&mut self, block: InOut<'_, '_, Block>) { + crate::soft::sm4_decrypt::(block, &self.0.rk); + } + + #[inline(always)] + fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, ParBlocks>) { + sm4_decrypt8::(blocks, &self.0.rk); + } +} diff --git a/sm4/src/x86/mod.rs b/sm4/src/x86/mod.rs new file mode 100644 index 00000000..ea50acd4 --- /dev/null +++ b/sm4/src/x86/mod.rs @@ -0,0 +1,5 @@ +//! SM4 X86 SIMD implementation + +mod aesni; +pub mod autodetect; +mod avx2; diff --git a/sm4/tests/mod.rs b/sm4/tests/mod.rs index db1f4149..798384ec 100644 --- a/sm4/tests/mod.rs +++ b/sm4/tests/mod.rs @@ -1,6 +1,6 @@ //! Test vectors are from GM/T 0002-2012 -use cipher::{BlockDecrypt, BlockEncrypt, KeyInit}; +use cipher::{Block, BlockDecrypt, BlockEncrypt, KeyInit}; use hex_literal::hex; use sm4::Sm4; @@ -39,3 +39,52 @@ fn sm4_example_2() { } assert_eq!(&plaintext, block.as_slice()); } + +#[test] +fn sm4_example_1_blocks() { + let key = hex!("0123456789abcdeffedcba9876543210"); + let plaintext: [Block; 15] = [ + key.into(), + key.into(), + key.into(), + key.into(), + key.into(), + key.into(), + key.into(), + key.into(), + key.into(), + key.into(), + key.into(), + key.into(), + key.into(), + key.into(), + key.into(), + ]; + let ciphertext_b = hex!("681EDF34D206965E86B3E94F536E4246"); + let ciphertext: [Block; 15] = [ + ciphertext_b.into(), + ciphertext_b.into(), + ciphertext_b.into(), + ciphertext_b.into(), + ciphertext_b.into(), + ciphertext_b.into(), + ciphertext_b.into(), + ciphertext_b.into(), + ciphertext_b.into(), + ciphertext_b.into(), + ciphertext_b.into(), + ciphertext_b.into(), + ciphertext_b.into(), + ciphertext_b.into(), + ciphertext_b.into(), + ]; + let cipher = Sm4::new(&key.into()); + + let mut blocks = plaintext; + cipher.encrypt_blocks(&mut blocks); + + assert_eq!(&ciphertext, &blocks); + + cipher.decrypt_blocks(&mut blocks); + assert_eq!(&plaintext, &blocks); +}