Add sqrt and more rsqrte neon instructions (#1078)

CryZe · web-flow · commit 78f1391fdc20 · 2021-03-13T14:27:44.000Z
This adds instructions for sqrt and some of the missing reciprocal
square-root estimate instructions.
diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -17,7 +17,7 @@ pub unsafe fn vabd_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
     #[allow(improper_ctypes)]
     extern "C" {
         #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fabd.v1f64")]
-        fn vabd_f64_(a: float64x1_t, a: float64x1_t) -> float64x1_t;
+        fn vabd_f64_(a: float64x1_t, b: float64x1_t) -> float64x1_t;
     }
     vabd_f64_(a, b)
 }
@@ -30,7 +30,7 @@ pub unsafe fn vabdq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
     #[allow(improper_ctypes)]
     extern "C" {
         #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fabd.v2f64")]
-        fn vabdq_f64_(a: float64x2_t, a: float64x2_t) -> float64x2_t;
+        fn vabdq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
     }
     vabdq_f64_(a, b)
 }
@@ -1087,7 +1087,7 @@ pub unsafe fn vmax_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
     #[allow(improper_ctypes)]
     extern "C" {
         #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v1f64")]
-        fn vmax_f64_(a: float64x1_t, a: float64x1_t) -> float64x1_t;
+        fn vmax_f64_(a: float64x1_t, b: float64x1_t) -> float64x1_t;
     }
     vmax_f64_(a, b)
 }
@@ -1100,7 +1100,7 @@ pub unsafe fn vmaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
     #[allow(improper_ctypes)]
     extern "C" {
         #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v2f64")]
-        fn vmaxq_f64_(a: float64x2_t, a: float64x2_t) -> float64x2_t;
+        fn vmaxq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
     }
     vmaxq_f64_(a, b)
 }
@@ -1113,7 +1113,7 @@ pub unsafe fn vmin_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t {
     #[allow(improper_ctypes)]
     extern "C" {
         #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmin.v1f64")]
-        fn vmin_f64_(a: float64x1_t, a: float64x1_t) -> float64x1_t;
+        fn vmin_f64_(a: float64x1_t, b: float64x1_t) -> float64x1_t;
     }
     vmin_f64_(a, b)
 }
@@ -1126,11 +1126,69 @@ pub unsafe fn vminq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
     #[allow(improper_ctypes)]
     extern "C" {
         #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmin.v2f64")]
-        fn vminq_f64_(a: float64x2_t, a: float64x2_t) -> float64x2_t;
+        fn vminq_f64_(a: float64x2_t, b: float64x2_t) -> float64x2_t;
     }
     vminq_f64_(a, b)
 }
 
+/// Calculates the square root of each lane.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fsqrt))]
+pub unsafe fn vsqrt_f32(a: float32x2_t) -> float32x2_t {
+    simd_fsqrt(a)
+}
+
+/// Calculates the square root of each lane.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fsqrt))]
+pub unsafe fn vsqrtq_f32(a: float32x4_t) -> float32x4_t {
+    simd_fsqrt(a)
+}
+
+/// Calculates the square root of each lane.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fsqrt))]
+pub unsafe fn vsqrt_f64(a: float64x1_t) -> float64x1_t {
+    simd_fsqrt(a)
+}
+
+/// Calculates the square root of each lane.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fsqrt))]
+pub unsafe fn vsqrtq_f64(a: float64x2_t) -> float64x2_t {
+    simd_fsqrt(a)
+}
+
+/// Reciprocal square-root estimate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frsqrte))]
+pub unsafe fn vrsqrte_f64(a: float64x1_t) -> float64x1_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v1f64")]
+        fn vrsqrte_f64_(a: float64x1_t) -> float64x1_t;
+    }
+    vrsqrte_f64_(a)
+}
+
+/// Reciprocal square-root estimate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(frsqrte))]
+pub unsafe fn vrsqrteq_f64(a: float64x2_t) -> float64x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f64")]
+        fn vrsqrteq_f64_(a: float64x2_t) -> float64x2_t;
+    }
+    vrsqrteq_f64_(a)
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
@@ -2233,4 +2291,52 @@ mod test {
         let r: f64x2 = transmute(vminq_f64(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vsqrt_f32() {
+        let a: f32x2 = f32x2::new(4.0, 9.0);
+        let e: f32x2 = f32x2::new(2.0, 3.0);
+        let r: f32x2 = transmute(vsqrt_f32(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vsqrtq_f32() {
+        let a: f32x4 = f32x4::new(4.0, 9.0, 16.0, 25.0);
+        let e: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
+        let r: f32x4 = transmute(vsqrtq_f32(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vsqrt_f64() {
+        let a: f64 = 4.0;
+        let e: f64 = 2.0;
+        let r: f64 = transmute(vsqrt_f64(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vsqrtq_f64() {
+        let a: f64x2 = f64x2::new(4.0, 9.0);
+        let e: f64x2 = f64x2::new(2.0, 3.0);
+        let r: f64x2 = transmute(vsqrtq_f64(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vrsqrte_f64() {
+        let a: f64 = 1.0;
+        let e: f64 = 0.998046875;
+        let r: f64 = transmute(vrsqrte_f64(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vrsqrteq_f64() {
+        let a: f64x2 = f64x2::new(1.0, 2.0);
+        let e: f64x2 = f64x2::new(0.998046875, 0.705078125);
+        let r: f64x2 = transmute(vrsqrteq_f64(transmute(a)));
+        assert_eq!(r, e);
+    }
 }
diff --git a/crates/core_arch/src/arm/neon/generated.rs b/crates/core_arch/src/arm/neon/generated.rs
@@ -3349,6 +3349,38 @@ pub unsafe fn vminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
 vminq_f32_(a, b)
 }
 
+/// Reciprocal square-root estimate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))]
+pub unsafe fn vrsqrte_f32(a: float32x2_t) -> float32x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2f32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32")]
+        fn vrsqrte_f32_(a: float32x2_t) -> float32x2_t;
+    }
+vrsqrte_f32_(a)
+}
+
+/// Reciprocal square-root estimate.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))]
+pub unsafe fn vrsqrteq_f32(a: float32x4_t) -> float32x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v4f32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v4f32")]
+        fn vrsqrteq_f32_(a: float32x4_t) -> float32x4_t;
+    }
+vrsqrteq_f32_(a)
+}
+
 #[cfg(test)]
 #[allow(overflowing_literals)]
 mod test {
@@ -5964,4 +5996,20 @@ mod test {
         let r: f32x4 = transmute(vminq_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vrsqrte_f32() {
+        let a: f32x2 = f32x2::new(1.0, 2.0);
+        let e: f32x2 = f32x2::new(0.998046875, 0.705078125);
+        let r: f32x2 = transmute(vrsqrte_f32(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vrsqrteq_f32() {
+        let a: f32x4 = f32x4::new(1.0, 2.0, 3.0, 4.0);
+        let e: f32x4 = f32x4::new(0.998046875, 0.705078125, 0.576171875, 0.4990234375);
+        let r: f32x4 = transmute(vrsqrteq_f32(transmute(a)));
+        assert_eq!(r, e);
+    }
 }
diff --git a/crates/core_arch/src/arm/neon/mod.rs b/crates/core_arch/src/arm/neon/mod.rs
@@ -136,10 +136,6 @@ extern "C" {
     #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.abs.v4i32")]
     fn vabsq_s32_(a: int32x4_t) -> int32x4_t;
 
-    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2f32")]
-    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32")]
-    fn frsqrte_v2f32(a: float32x2_t) -> float32x2_t;
-
     //uint32x2_t vqmovn_u64 (uint64x2_t a)
     #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqmovnu.v2i32")]
     #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqxtn.v2i32")]
@@ -2466,15 +2462,6 @@ pub unsafe fn vmovl_u32(a: uint32x2_t) -> uint64x2_t {
     simd_cast(a)
 }
 
-/// Reciprocal square-root estimate.
-#[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))]
-pub unsafe fn vrsqrte_f32(a: float32x2_t) -> float32x2_t {
-    frsqrte_v2f32(a)
-}
-
 /// Vector bitwise not.
 #[inline]
 #[target_feature(enable = "neon")]
@@ -7906,14 +7893,6 @@ mod tests {
         assert_eq!(r, e);
     }
 
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vrsqrt_f32() {
-        let a = f32x2::new(1.0, 2.0);
-        let e = f32x2::new(0.9980469, 0.7050781);
-        let r: f32x2 = transmute(vrsqrte_f32(transmute(a)));
-        assert_eq!(r, e);
-    }
-
     #[simd_test(enable = "neon")]
     unsafe fn test_vpmin_s8() {
         let a = i8x8::new(1, -2, 3, -4, 5, 6, 7, 8);
diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec
@@ -720,3 +720,25 @@ aarch64 = fmin
 link-arm = vmins._EXT_
 link-aarch64 = fmin._EXT_
 generate float*_t
+
+/// Calculates the square root of each lane.
+name = vsqrt
+fn = simd_fsqrt
+a = 4.0, 9.0, 16.0, 25.0
+validate 2.0, 3.0, 4.0, 5.0
+
+aarch64 = fsqrt
+generate float*_t, float64x*_t
+
+/// Reciprocal square-root estimate.
+name = vrsqrte
+a = 1.0, 2.0, 3.0, 4.0
+validate 0.998046875, 0.705078125, 0.576171875, 0.4990234375
+
+aarch64 = frsqrte
+link-aarch64 = frsqrte._EXT_
+generate float64x*_t
+
+arm = vrsqrte
+link-arm = vrsqrte._EXT_
+generate float*_t
diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs
@@ -345,13 +345,20 @@ fn gen_aarch64(
             r#"#[allow(improper_ctypes)]
     extern "C" {{
         #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.{}")]
-        fn {}(a: {}, a: {}) -> {};
+        fn {}({}) -> {};
     }}
     "#,
             link_aarch64.replace("_EXT_", ext),
             current_fn,
-            in_t,
-            in_t,
+            match para_num {
+                1 => {
+                    format!("a: {}", in_t)
+                }
+                2 => {
+                    format!("a: {}, b: {}", in_t, in_t)
+                }
+                _ => unimplemented!("unknown para_num"),
+            },
             out_t
         )
     } else {
@@ -527,7 +534,7 @@ fn gen_arm(
         }
         String::new()
     } else {
-        if link_aarch64.is_none() || link_arm.is_none() {
+        if link_aarch64.is_none() && link_arm.is_none() {
             panic!(
                 "[{}] Either fn or link-arm and link-aarch have to be specified.",
                 name
@@ -544,14 +551,21 @@ fn gen_arm(
     extern "C" {{
         #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.{}")]
         #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.{}")]
-        fn {}(a: {}, b: {}) -> {};
+        fn {}({}) -> {};
     }}
 "#,
                 link_arm.replace("_EXT_", ext),
                 link_aarch64.replace("_EXT_", ext),
                 current_fn,
-                in_t,
-                in_t,
+                match para_num {
+                    1 => {
+                        format!("a: {}", in_t)
+                    }
+                    2 => {
+                        format!("a: {}, b: {}", in_t, in_t)
+                    }
+                    _ => unimplemented!("unknown para_num"),
+                },
                 out_t
             )
         } else {