Upgrade rust version #16

Merged
merged 9 commits on Mar 11, 2022
10 changes: 5 additions & 5 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion aligned-cmov/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "aligned-cmov"
-version = "2.0.0"
+version = "2.1.0"
 description = "Fast constant-time conditional moves of aligned bytes"
 authors = ["MobileCoin"]
 license = "GPL-3.0"
222 changes: 104 additions & 118 deletions aligned-cmov/src/cmov_impl_asm.rs
@@ -1,4 +1,4 @@
-// Copyright (c) 2018-2021 The MobileCoin Foundation
+// Copyright (c) 2018-2022 The MobileCoin Foundation
 
 //! Implementation of cmov on x86-64 using inline assembly.
 //!
@@ -19,45 +19,41 @@
 //! For now it seems simplest to use inline assembly for all of it.
 
 use super::{A64Bytes, A8Bytes, ArrayLength};
+use core::arch::asm;
 
 // CMov for u32 values
 #[inline]
 pub fn cmov_u32(condition: bool, src: &u32, dest: &mut u32) {
-    // Create a temporary to be assigned to a register for cmov
-    let mut temp: u32 = *dest;
-    // Test condition ($1)
-    // cmovnz from src into temp ($2 to $0)
     unsafe {
-        llvm_asm!("test $1, $1
-                   cmovnz $0, $2"
-                  : "+&r"(temp)
-                  : "r"(condition), "rm"(*src)
-                  : "cc"
-                  : "volatile", "intel");
+        asm!(
+            // Set ZF if cond == 0
+            "test {0}, {0}",
+            // Conditionally move src into dest (based on ZF).
+            // `:e` formats to the 32-bit register.
+            "cmovnz {2:e}, {1:e}",
+            in(reg_byte) condition as u8,
+            in(reg) *src,
+            // inout since we might not write, so we need the existing value.
+            inout(reg) *dest,
+        );
     }
-    // "cc" is because we are setting flags in test
-    // "volatile" is meant to discourage the optimizer from inspecting this
-    *dest = temp;
 }
 
 // CMov for u64 values
 #[inline]
 pub fn cmov_u64(condition: bool, src: &u64, dest: &mut u64) {
-    // Create a temporary to be assigned to a register for cmov
-    let mut temp: u64 = *dest;
-    // Test condition ($1)
-    // cmovnz from src into temp ($2 to $0)
     unsafe {
-        llvm_asm!("test $1, $1
-                   cmovnz $0, $2"
-                  : "+&r"(temp)
-                  : "r"(condition), "rm"(*src)
-                  : "cc"
-                  : "volatile", "intel");
+        asm!(
+            // Set ZF if cond == 0
+            "test {0}, {0}",
+            // Conditionally move src into dest (based on ZF)
+            "cmovnz {2}, {1}",
+            in(reg_byte) condition as u8,
+            in(reg) *src,
+            // inout since we might not write, so we need the existing value.
+            inout(reg) *dest,
+        );
     }
-    // "cc" is because we are setting flags in test
-    // "volatile" is meant to discourage the optimizer from inspecting this
-    *dest = temp;
 }
 
 // CMov for i32 values
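
For intuition, the new scalar cmov is semantically `if condition { *dest = *src }`, written so that no branch on `condition` is ever emitted. Below is a minimal mask-based sketch of the same semantics (hypothetical reference code, not part of this PR); the crate uses inline asm instead because the compiler could lower a masking expression like this back into a branch:

```rust
// Hypothetical reference model of cmov_u64, not the crate's code.
// wrapping_neg maps 1 -> all-ones and 0 -> all-zeros, mirroring `neg`.
fn cmov_u64_model(condition: bool, src: &u64, dest: &mut u64) {
    let mask = (condition as u64).wrapping_neg();
    // Take src where mask is all-ones, keep dest where mask is all-zeros.
    *dest = (*src & mask) | (*dest & !mask);
}
```
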
@@ -124,7 +120,6 @@ pub fn cmov_a64_bytes<N: ArrayLength<u8>>(
 // If we have avx2 then we can use cmov_byte_slice_a64 implementation
 #[cfg(target_feature = "avx2")]
 #[inline]
-#[allow(unused)]
 pub fn cmov_a64_bytes<N: ArrayLength<u8>>(
     condition: bool,
     src: &A64Bytes<N>,
@@ -151,13 +146,8 @@ pub fn cmov_a64_bytes<N: ArrayLength<u8>>(
 // if condition { memcpy(dest, src, count * 8) }
 // for pointers aligned to 8 byte boundary. Assumes count > 0
 #[inline]
-#[allow(unused)]
-unsafe fn cmov_byte_slice_a8(condition: bool, src: *const u64, dest: *mut u64, mut count: usize) {
-    // Create a temporary to be assigned to a register for cmov
-    // We also use it to pass the condition in, which is only needed in a register
-    // before entering the loop.
-    let mut temp: u64 = condition as u64;
-
+unsafe fn cmov_byte_slice_a8(condition: bool, src: *const u64, dest: *mut u64, count: usize) {
+    debug_assert!(count > 0, "count cannot be 0");
     // The idea here is to test once before we enter the loop and reuse the test
     // result for every cmov.
     // The loop is a dec, jnz loop.
@@ -167,109 +157,105 @@ unsafe fn cmov_byte_slice_a8(condition: bool, src: *const u64, dest: *mut u64, mut count: usize) {
     // indexing. Because dec will clobber ZF, we can't use cmovnz. We store
     // the result of the outer test in CF, which is not clobbered by dec.
     // cmovc is contingent on CF
-    // negb is used to set CF to 1 iff the condition value was 1.
+    // neg is used to set CF to 1 iff the condition value was 1.
     // Pity, we cannot cmov directly to memory
     //
-    // temp is a "register output" because this is the way to obtain a scratch
-    // register when we don't care what actual register is used and we want the
-    // compiler to pick.
-    //
-    // count is modified by the assembly -- for this reason it has to be labelled
-    // as an input and an output, otherwise its illegal to modify it.
-    //
-    // The condition is passed in through temp, then neg moves the value to CF.
-    // After that we don't need condition in a register, so temp register can be
+    // The condition is passed in through {0}, then neg moves the value to CF.
+    // After that we don't need the condition in a register, so {0} can be
     // reused.
-    llvm_asm!("neg $0
-               loop_body_${:uid}:
-               mov $0, [$3 + 8*$1 - 8]
-               cmovc $0, [$2 + 8*$1 - 8]
-               mov [$3 + 8*$1 - 8], $0
-               dec $1
-               jnz loop_body_${:uid}"
-              : "+&r"(temp), "+&r"(count)
-              : "r"(src), "r"(dest)
-              : "cc", "memory"
-              : "volatile", "intel");
-    // cc is because we are setting flags in test
-    // memory is because we are dereferencing a bunch of pointers in asm
-    // volatile is because the output variable "temp" is not the true output
-    // of this block, the memory side-effects are.
+    asm!(
+        // Sets CF = 0 iff cond == 0, and CF = 1 otherwise.
+        "neg {0}",
+        // Must use numeric local labels, since this block may be inlined multiple times.
+        // https://doc.rust-lang.org/nightly/rust-by-example/unsafe/asm.html#labels
+        "42:",
+        // Copy into temp register from dest (the no-op default).
+        "mov {0}, [{3} + 8*{1} - 8]",
+        // Conditionally copy into temp register from src (based on CF).
+        "cmovc {0}, [{2} + 8*{1} - 8]",
+        // Copy from temp register into dest.
+        "mov [{3} + 8*{1} - 8], {0}",
+        // Decrement count. Sets ZF = 1 when we reach 0.
+        "dec {1}",
+        // If ZF is not set, loop.
+        "jnz 42b",
+        // Discard outputs; the memory side effects are what's desired.
+        inout(reg) (condition as u64) => _,
+        inout(reg) count => _,
+        in(reg) src,
+        in(reg) dest,
+    );
 }
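
As a reading aid, here is a plain-Rust model of what this loop computes (a hedged sketch, not the crate's code; the real function stays in asm so that the condition is tested exactly once and no data-dependent branch appears):

```rust
// Hypothetical model of cmov_byte_slice_a8: for each aligned 8-byte word,
// conditionally overwrite dest with src, branch-free. Not the crate's code.
unsafe fn cmov_byte_slice_a8_model(condition: bool, src: *const u64, dest: *mut u64, count: usize) {
    debug_assert!(count > 0, "count cannot be 0");
    // Like `neg` above: 1 -> all-ones, 0 -> all-zeros.
    let mask = (condition as u64).wrapping_neg();
    for i in 0..count {
        let s = core::ptr::read(src.add(i));
        let d = core::ptr::read(dest.add(i));
        core::ptr::write(dest.add(i), (s & mask) | (d & !mask));
    }
}
```
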

 // Should be a constant time function equivalent to:
 // if condition { memcpy(dest, src, num_bytes) }
 // for pointers aligned to *64* byte boundary. Will fault if that is not the
-// case. Assumes num_bytes > 0, and num_bytes divisible by 64!
+// case. Requires num_bytes > 0, and num_bytes divisible by 64.
 // This version uses AVX2 256-bit moves
 #[cfg(target_feature = "avx2")]
 #[inline]
-#[allow(unused)]
 unsafe fn cmov_byte_slice_a64(condition: bool, src: *const u64, dest: *mut u64, num_bytes: usize) {
-    debug_assert!(
-        num_bytes > 0,
-        "num_bytes cannot be 0, caller must handle that"
-    );
-    debug_assert!(
-        num_bytes % 64 == 0,
-        "num_bytes must be divisible by 64, caller must handle that"
-    );
-    // Similarly as before, we want to test once and use the test result for the
-    // whole loop.
-    //
-    // Before we enter the loop, we want to set ymm1 to all 0s or all 1s, depending
-    // on condition. We use neg to make it all 0s or all 1s 64bit, then vmovq to
-    // move that to xmm2, then vbroadcastsd to fill ymm1 with 1s or zeros.
-    //
-    // This time the cmov mechanism is:
-    // - VMOVDQA to load the source into a ymm register,
-    // - VPMASKMOVQ to move that to memory, after masking it with ymm1.
-    // An alternative approach which I didn't test is, MASKMOV to another ymm
-    // register, then move that register to memory.
-    //
+    debug_assert!(num_bytes > 0, "num_bytes cannot be 0");
+    debug_assert!(num_bytes % 64 == 0, "num_bytes must be divisible by 64");
 
     // Notes:
-    // temp = $0 is the scratch register
-    // We aren't allowed to modify condition or num_bytes = $3 unless
-    // we annotate them appropriately as outputs.
-    // So, num_bytes is an in/out register, and we pass condition in via
-    // the scratch register, instead of a dedicated register.
+    // temp = {0} is the scratch register
     //
-    // Once we have the mask in ymm1 we don't need the condition in a register
-    // anymore. Then, $0 is the loop variable, counting down from num_bytes in
-    // steps of 64
-    //
-    // The values of $0 in the loop are num_bytes, num_bytes - 64, ... 64,
+    // The values of {0} in the loop are num_bytes, num_bytes - 64, ... 64,
     // rather than num_bytes - 64 ... 0. This is because the semantics of the loop
     // are subtract 64, then test for 0, rather than test for 0, then subtract.
-    // So when we index using $0, we subtract 64 to compensate.
-    //
-    // We unroll the loop once because we can assume 64 byte alignment, but ymm
-    // register holds 32 bytes. So one round of vmovdqa, vpmaskmovq moves 32 bytes.
-    // So we do this twice in one pass through the loop.
-    //
-    // TODO: In AVX512 zmm registers we could move 64 bytes at once...
-    let mut temp: u64 = condition as u64;
-
-    llvm_asm!("neg $0
-               vmovq xmm2, $0
-               vbroadcastsd ymm1, xmm2
-               mov $0, $3
-               loop_body2_${:uid}:
-               vmovdqa ymm2, [$1 + $0 - 64]
-               vpmaskmovq [$2 + $0 - 64], ymm1, ymm2
-               vmovdqa ymm3, [$1 + $0 - 32]
-               vpmaskmovq [$2 + $0 - 32], ymm1, ymm3
-               sub $0, 64
-               jnz loop_body2_${:uid}"
-              : "+&r"(temp)
-              : "r"(src), "r"(dest), "rmi"(num_bytes)
-              : "cc", "memory", "ymm1", "ymm2", "ymm3"
-              : "volatile", "intel");
-    // cc is because we are setting flags
-    // memory is because we are dereferencing a bunch of pointers in asm
-    // volatile is because the output variable "temp" is not the true output
-    // of this block, the memory side-effects are.
+    // So when we index using {0}, we subtract 64 to compensate.
+    // TODO: Does unrolling the loop more help?
+    asm!(
+        // Similarly to cmov_byte_slice_a8, we want to test once and use the
+        // result for the whole loop.
+        // Before we enter the loop, we want to set ymm1 to all 0s or all 1s,
+        // depending on condition. We use neg to make all 64 bits 0s or all 1s
+        // (since neg(0) = 0 and neg(1) = -1 = all ones in two's complement),
+        // then vmovq to move that to xmm2, then vbroadcastsd to fill ymm1
+        // with ones or zeros.
+        "neg {0}",
+        "vmovq xmm2, {0}",
+        "vbroadcastsd ymm1, xmm2",
+        // Once we have the mask in ymm1 we don't need the condition in
+        // a register anymore. Then, {0} is the loop variable, counting down
+        // from num_bytes in steps of 64.
+        "mov {0}, {3}",
+        // Must use numeric local labels, since this block may be inlined multiple times.
+        // https://doc.rust-lang.org/nightly/rust-by-example/unsafe/asm.html#labels
+        "42:",
+        // This time the cmov mechanism is:
+        // - VMOVDQA to load the source into a ymm register,
+        // - VPMASKMOVQ to move that to memory, after masking it with ymm1.
+        // An alternative approach which I didn't test is, MASKMOV to
+        // another ymm register, then move that register to memory.
+        "vmovdqa ymm2, [{1} + {0} - 64]",
+        "vpmaskmovq [{2} + {0} - 64], ymm1, ymm2",
+        // We unroll the loop once because we can assume 64 byte alignment,
+        // but a ymm register holds 32 bytes. So one round of
+        // vmovdqa+vpmaskmovq moves 32 bytes, and we do it twice per pass
+        // through the loop.
+        //
+        // TODO: With AVX512 zmm registers we could move 64 bytes at once...
+        "vmovdqa ymm3, [{1} + {0} - 32]",
+        "vpmaskmovq [{2} + {0} - 32], ymm1, ymm3",
+        // Decrement num_bytes. Sets ZF = 1 when we reach 0.
+        "sub {0}, 64",
+        // If ZF is not set, loop.
+        "jnz 42b",
+        // Discard output; the memory side effects are what's desired.
+        inout(reg) condition as u64 => _,
+        in(reg) src,
+        in(reg) dest,
+        in(reg) num_bytes,
+        // Scratch/temp registers.
+        out("ymm1") _,
+        out("ymm2") _,
+        out("ymm3") _,
+    );
 }

 // TODO: In avx512 there is vmovdqa which takes a mask register, and kmovq to
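
For comparison, the same AVX2 scheme can be expressed with `core::arch::x86_64` intrinsics. This is a sketch under the assumption that exact instruction scheduling is not required; the PR keeps inline asm, which pins the emitted sequence, whereas with intrinsics the compiler may restructure the loop:

```rust
// Hypothetical intrinsics rendering of cmov_byte_slice_a64, not the crate's code.
#[cfg(target_feature = "avx2")]
unsafe fn cmov_byte_slice_a64_model(condition: bool, src: *const u64, dest: *mut u64, num_bytes: usize) {
    use core::arch::x86_64::{__m256i, _mm256_load_si256, _mm256_maskstore_epi64, _mm256_set1_epi64x};
    debug_assert!(num_bytes > 0 && num_bytes % 64 == 0);
    // The `neg` trick: broadcast all-ones when condition is true, else all-zeros.
    let mask = _mm256_set1_epi64x(-(condition as i64));
    let mut i = 0;
    while i < num_bytes / 8 {
        // 32-byte aligned load (vmovdqa), then masked store (vpmaskmovq).
        let v = _mm256_load_si256(src.add(i) as *const __m256i);
        _mm256_maskstore_epi64(dest.add(i) as *mut i64, mask, v);
        i += 4;
    }
}
```
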
1 change: 0 additions & 1 deletion aligned-cmov/src/lib.rs
@@ -1,7 +1,6 @@
 // Copyright (c) 2018-2021 The MobileCoin Foundation
 
 #![no_std]
-#![feature(llvm_asm)]
 
 pub use aligned_array::{subtle, Aligned, AsAlignedChunks, AsNeSlice, A64, A8};
 pub use generic_array::{arr, typenum, ArrayLength, GenericArray};
4 changes: 2 additions & 2 deletions balanced-tree-index/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "balanced-tree-index"
-version = "2.0.0"
+version = "2.1.0"
 description = "Utilities for constant-time manipulation of a complete binary tree with a flat in-memory representation."
 authors = ["MobileCoin"]
 license = "GPL-3.0"
@@ -11,7 +11,7 @@ keywords = ["binary-tree", "constant-time", "utilities", "no_std"]
 categories = ["cryptography", "data-structures"]
 
 [dependencies]
-aligned-cmov = { path = "../aligned-cmov", version = "2" }
+aligned-cmov = { path = "../aligned-cmov", version = "2.1" }
 
 rand_core = { version = "0.6", default-features = false }

4 changes: 2 additions & 2 deletions mc-oblivious-map/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "mc-oblivious-map"
-version = "2.0.0"
+version = "2.1.0"
 description = "Implementation of Oblivious Hash Map data structures on top of Oblivious RAM"
 authors = ["MobileCoin"]
 license = "GPL-3.0"
@@ -15,7 +15,7 @@ no_asm_insecure = ["aligned-cmov/no_asm_insecure"]
 
 [dependencies]
 aligned-array = { version = "1", features = ["subtle"] }
-aligned-cmov = { path = "../aligned-cmov", version = "2" }
+aligned-cmov = { path = "../aligned-cmov", version = "2.1" }
 mc-oblivious-traits = { path = "../mc-oblivious-traits", version = "2" }
 
 generic-array = { version = "0.14", default-features = false }
4 changes: 2 additions & 2 deletions mc-oblivious-ram/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "mc-oblivious-ram"
-version = "2.0.0"
+version = "2.1.0"
 description = "Implementations of Oblivious RAM data structures"
 authors = ["MobileCoin"]
 license = "GPL-3.0"
@@ -14,7 +14,7 @@ categories = ["cryptography", "data-structures", "no-std"]
 no_asm_insecure = ["aligned-cmov/no_asm_insecure"]
 
 [dependencies]
-aligned-cmov = { path = "../aligned-cmov", version = "2" }
+aligned-cmov = { path = "../aligned-cmov", version = "2.1" }
 balanced-tree-index = { path = "../balanced-tree-index", version = "2" }
 mc-oblivious-traits = { path = "../mc-oblivious-traits", version = "2" }