riscv: P extension intrinsics for packed SIMD (part 1)

luojia65 · luojia65 · commit e6bc2611e208 · 2022-09-09T16:02:33.000+08:00
Implemented with inline assembly for now, using the `pure, nomem, nostack`
options for all packed SIMD arithmetic instructions. `inlateout` is used
when an instruction requires the same register for input and output.

This commit also rearranges the shared RISC-V architecture module to
improve the documentation. It includes a doc test fix as well.
diff --git a/crates/core_arch/src/mod.rs b/crates/core_arch/src/mod.rs
@@ -3,6 +3,9 @@
 #[macro_use]
 mod macros;
 
+#[cfg(any(target_arch = "riscv32", target_arch = "riscv64", doc))]
+mod riscv_shared;
+
 #[cfg(any(target_arch = "arm", target_arch = "aarch64", doc))]
 mod arm_shared;
 
@@ -276,10 +279,6 @@ mod aarch64;
 #[doc(cfg(any(target_arch = "arm")))]
 mod arm;
 
-#[cfg(any(target_arch = "riscv32", target_arch = "riscv64", doc))]
-#[doc(cfg(any(target_arch = "riscv32", target_arch = "riscv64")))]
-mod riscv_shared;
-
 #[cfg(any(target_arch = "riscv64", doc))]
 #[doc(cfg(any(target_arch = "riscv64")))]
 mod riscv64;
diff --git a/crates/core_arch/src/riscv_shared/mod.rs b/crates/core_arch/src/riscv_shared/mod.rs
@@ -1,4 +1,7 @@
 //! Shared RISC-V intrinsics
+mod p;
+
+pub use p::*;
 
 use crate::arch::asm;
 
@@ -674,12 +677,17 @@ pub fn sm3p1(x: u32) -> u32 {
 /// It can be implemented by `sm4ed` instruction like:
 ///
 /// ```no_run
+/// # #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
+/// # fn round_function(x0: u32, x1: u32, x2: u32, x3: u32, rk: u32) -> u32 {
+/// # #[cfg(target_arch = "riscv32")] use core::arch::riscv32::sm4ed;
+/// # #[cfg(target_arch = "riscv64")] use core::arch::riscv64::sm4ed;
 /// let a = x1 ^ x2 ^ x3 ^ rk;
 /// let c0 = sm4ed::<0>(x0, a);
 /// let c1 = sm4ed::<1>(c0, a); // c1 represents c[0..=1], etc.
 /// let c2 = sm4ed::<2>(c1, a);
 /// let c3 = sm4ed::<3>(c2, a);
 /// return c3; // c3 represents c[0..=3]
+/// # }
 /// ```
 ///
 /// According to RISC-V Cryptography Extensions, Volume I, the execution latency of
@@ -739,12 +747,17 @@ pub fn sm4ed<const BS: u8>(x: u32, a: u32) -> u32 {
 /// Hence, the key schedule operation can be implemented by `sm4ks` instruction like:
 ///
 /// ```no_run
+/// # #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
+/// # fn key_schedule(k0: u32, k1: u32, k2: u32, k3: u32, ck_i: u32) -> u32 {
+/// # #[cfg(target_arch = "riscv32")] use core::arch::riscv32::sm4ks;
+/// # #[cfg(target_arch = "riscv64")] use core::arch::riscv64::sm4ks;
 /// let k = k1 ^ k2 ^ k3 ^ ck_i;
 /// let c0 = sm4ks::<0>(k0, k);
 /// let c1 = sm4ks::<1>(c0, k); // c1 represents c[0..=1], etc.
 /// let c2 = sm4ks::<2>(c1, k);
 /// let c3 = sm4ks::<3>(c2, k);
 /// return c3; // c3 represents c[0..=3]
+/// # }
 /// ```
 ///
 /// According to RISC-V Cryptography Extensions, Volume I, the execution latency of
diff --git a/crates/core_arch/src/riscv_shared/p.rs b/crates/core_arch/src/riscv_shared/p.rs