Tencent · nihui · Oct 10, 2022 · Oct 9, 2022 · Oct 9, 2022 · Oct 9, 2022
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -171,7 +171,7 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
 
         set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+bf16")
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vbfmmlaq_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16)
 
         set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+i8mm")
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)

diff --git a/src/layer/arm/cast_bf16.h b/src/layer/arm/cast_bf16.h
@@ -150,7 +150,7 @@ static void cast_fp32_to_bf16_neon(const Mat& bottom_blob, Mat& top_blob, const
 
 static void cast_bf16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
 {
-#if NCNN_ARM84BF16 && __aarch64__ && !__ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#if NCNN_RUNTIME_CPU && NCNN_ARM84BF16 && __aarch64__ && !__ARM_FEATURE_BF16_VECTOR_ARITHMETIC
     if (ncnn::cpu_support_arm_bf16())
     {
         cast_bf16_to_fp32_neon_bf16(bottom_blob, top_blob, opt);

diff --git a/src/layer/arm/cast_fp16.h b/src/layer/arm/cast_fp16.h
@@ -47,12 +47,12 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const
         {
 #if __aarch64__
             asm volatile(
-                "prfm   pldl1keep, [%0, #512]   \n"
+                "prfm   pldl1keep, [%0, #512]       \n"
                 "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
-                "fcvtn  v0.4h, v0.4s            \n"
-                "fcvtn  v1.4h, v1.4s            \n"
-                "fcvtn  v2.4h, v2.4s            \n"
-                "fcvtn  v3.4h, v3.4s            \n"
+                "fcvtn  v0.4h, v0.4s                \n"
+                "fcvtn  v1.4h, v1.4s                \n"
+                "fcvtn  v2.4h, v2.4s                \n"
+                "fcvtn  v3.4h, v3.4s                \n"
                 "st1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n"
                 : "=r"(ptr),   // %0
                 "=r"(outptr) // %1
@@ -61,12 +61,12 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const
                 : "memory", "v0", "v1", "v2", "v3");
 #else  // __aarch64__
             asm volatile(
-                "pld        [%0, #512]      \n"
-                "vldm       %0!, {d0-d7}    \n"
-                "vcvt.f16.f32 d0, q0        \n"
-                "vcvt.f16.f32 d1, q1        \n"
-                "vcvt.f16.f32 d2, q2        \n"
-                "vcvt.f16.f32 d3, q3        \n"
+                "pld        [%0, #512]          \n"
+                "vldm       %0!, {d0-d7}        \n"
+                "vcvt.f16.f32 d0, q0            \n"
+                "vcvt.f16.f32 d1, q1            \n"
+                "vcvt.f16.f32 d2, q2            \n"
+                "vcvt.f16.f32 d3, q3            \n"
                 "vst1.u16   {d0-d3}, [%1 :128]! \n"
                 : "=r"(ptr),   // %0
                 "=r"(outptr) // %1
@@ -77,24 +77,61 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const
         }
         for (; i + 7 < size; i += 8)
         {
-            float32x4_t _p0_fp32 = vld1q_f32(ptr);
-            float32x4_t _p1_fp32 = vld1q_f32(ptr + 4);
-            float16x4_t _p0_fp16 = vcvt_f16_f32(_p0_fp32);
-            float16x4_t _p1_fp16 = vcvt_f16_f32(_p1_fp32);
-            uint16x8_t _p_fp16 = vcombine_u16(vreinterpret_u16_f16(_p0_fp16), vreinterpret_u16_f16(_p1_fp16));
-            vst1q_u16(outptr, _p_fp16);
-            ptr += 8;
-            outptr += 8;
+            // This was originally implemented with NEON fp16 intrinsics.
+            // Newer versions of gcc require __ARM_FP16_FORMAT_IEEE or __ARM_FP16_FORMAT_ALTERNATIVE to be defined before the float16x4_t type can be used.
+            // That leads to a compile error when building with -mfpu=neon-vfpv4 but without the -mfp16-format=ieee flag.
+            // We could add more macro conditions to differentiate between old and new compiler versions, but that would be pretty ugly!
+            // So just use inline assembly for everything here ~
+            //          --- nihui
+#if __aarch64__
+            asm volatile(
+                "ld1    {v0.4s, v1.4s}, [%0], #32   \n"
+                "fcvtn  v0.4h, v0.4s                \n"
+                "fcvtn  v1.4h, v1.4s                \n"
+                "st1    {v0.4h, v1.4h}, [%1], #16   \n"
+                : "=r"(ptr),   // %0
+                "=r"(outptr) // %1
+                : "0"(ptr),
+                "1"(outptr)
+                : "memory", "v0", "v1");
+#else  // __aarch64__
+            asm volatile(
+                "vld1.f32   {d0-d3}, [%0]!  \n"
+                "vcvt.f16.f32 d0, q0        \n"
+                "vcvt.f16.f32 d1, q1        \n"
+                "vst1.u16   {d0-d1}, [%1]!  \n"
+                : "=r"(ptr),   // %0
+                "=r"(outptr) // %1
+                : "0"(ptr),
+                "1"(outptr)
+                : "memory", "q0", "q1");
+#endif // __aarch64__
         }
         for (; i + 3 < size; i += 4)
         {
-            float32x4_t _p_fp32 = vld1q_f32(ptr);
-            float16x4_t _p_fp16 = vcvt_f16_f32(_p_fp32);
-            vst1_u16(outptr, vreinterpret_u16_f16(_p_fp16));
-            ptr += 4;
-            outptr += 4;
+#if __aarch64__
+            asm volatile(
+                "ld1    {v0.4s}, [%0], #16  \n"
+                "fcvtn  v0.4h, v0.4s        \n"
+                "st1    {v0.4h}, [%1], #8   \n"
+                : "=r"(ptr),   // %0
+                "=r"(outptr) // %1
+                : "0"(ptr),
+                "1"(outptr)
+                : "memory", "v0");
+#else  // __aarch64__
+            asm volatile(
+                "vld1.f32   {d0-d1}, [%0]!  \n"
+                "vcvt.f16.f32 d0, q0        \n"
+                "vst1.u16   {d0}, [%1]!     \n"
+                : "=r"(ptr),   // %0
+                "=r"(outptr) // %1
+                : "0"(ptr),
+                "1"(outptr)
+                : "memory", "q0");
+#endif // __aarch64__
         }
-#endif
+#endif // (__ARM_FP & 2)
         for (; i < size; i++)
         {
             *outptr++ = float32_to_float16(*ptr++);
@@ -104,7 +141,7 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const
 
 static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
 {
-#if NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2)
+#if NCNN_RUNTIME_CPU && NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2)
     if (ncnn::cpu_support_arm_vfpv4())
     {
         cast_fp16_to_fp32_neon_vfpv4(bottom_blob, top_blob, opt);
@@ -132,12 +169,12 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const
         {
 #if __aarch64__
             asm volatile(
-                "prfm   pldl1keep, [%0, #256]   \n"
+                "prfm   pldl1keep, [%0, #256]       \n"
                 "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%0], #32 \n"
-                "fcvtl  v0.4s, v0.4h            \n"
-                "fcvtl  v1.4s, v1.4h            \n"
-                "fcvtl  v2.4s, v2.4h            \n"
-                "fcvtl  v3.4s, v3.4h            \n"
+                "fcvtl  v0.4s, v0.4h                \n"
+                "fcvtl  v1.4s, v1.4h                \n"
+                "fcvtl  v2.4s, v2.4h                \n"
+                "fcvtl  v3.4s, v3.4h                \n"
                 "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
                 : "=r"(ptr),   // %0
                 "=r"(outptr) // %1
@@ -146,13 +183,13 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const
                 : "memory", "v0", "v1", "v2", "v3");
 #else  // __aarch64__
             asm volatile(
-                "pld        [%0, #256]      \n"
+                "pld        [%0, #256]          \n"
                 "vld1.u16   {d4-d7}, [%0 :128]! \n"
-                "vcvt.f32.f16 q0, d4        \n"
-                "vcvt.f32.f16 q1, d5        \n"
-                "vcvt.f32.f16 q2, d6        \n"
-                "vcvt.f32.f16 q3, d7        \n"
-                "vstm       %1!, {d0-d7}    \n"
+                "vcvt.f32.f16 q0, d4            \n"
+                "vcvt.f32.f16 q1, d5            \n"
+                "vcvt.f32.f16 q2, d6            \n"
+                "vcvt.f32.f16 q3, d7            \n"
+                "vstm       %1!, {d0-d7}        \n"
                 : "=r"(ptr),   // %0
                 "=r"(outptr) // %1
                 : "0"(ptr),
@@ -162,25 +199,55 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const
         }
         for (; i + 7 < size; i += 8)
         {
-            uint16x8_t _p_fp16 = vld1q_u16(ptr);
-            float16x4_t _p0_fp16 = vreinterpret_f16_u16(vget_low_u16(_p_fp16));
-            float16x4_t _p1_fp16 = vreinterpret_f16_u16(vget_high_u16(_p_fp16));
-            float32x4_t _p0_fp32 = vcvt_f32_f16(_p0_fp16);
-            float32x4_t _p1_fp32 = vcvt_f32_f16(_p1_fp16);
-            vst1q_f32(outptr, _p0_fp32);
-            vst1q_f32(outptr + 4, _p1_fp32);
-            ptr += 8;
-            outptr += 8;
+#if __aarch64__
+            asm volatile(
+                "ld1    {v0.4h, v1.4h}, [%0], #16   \n"
+                "fcvtl  v0.4s, v0.4h                \n"
+                "fcvtl  v1.4s, v1.4h                \n"
+                "st1    {v0.4s, v1.4s}, [%1], #32   \n"
+                : "=r"(ptr),   // %0
+                "=r"(outptr) // %1
+                : "0"(ptr),
+                "1"(outptr)
+                : "memory", "v0", "v1");
+#else  // __aarch64__
+            asm volatile(
+                "vld1.u16   {d4-d5}, [%0]!  \n"
+                "vcvt.f32.f16 q0, d4        \n"
+                "vcvt.f32.f16 q1, d5        \n"
+                "vst1.f32   {d0-d3}, [%1]!  \n"
+                : "=r"(ptr),   // %0
+                "=r"(outptr) // %1
+                : "0"(ptr),
+                "1"(outptr)
+                : "memory", "q0", "q1", "q2");
+#endif // __aarch64__
         }
         for (; i + 3 < size; i += 4)
         {
-            float16x4_t _p_fp16 = vreinterpret_f16_u16(vld1_u16(ptr));
-            float32x4_t _p_fp32 = vcvt_f32_f16(_p_fp16);
-            vst1q_f32(outptr, _p_fp32);
-            ptr += 4;
-            outptr += 4;
+#if __aarch64__
+            asm volatile(
+                "ld1    {v0.4h}, [%0], #8   \n"
+                "fcvtl  v0.4s, v0.4h        \n"
+                "st1    {v0.4s}, [%1], #16  \n"
+                : "=r"(ptr),   // %0
+                "=r"(outptr) // %1
+                : "0"(ptr),
+                "1"(outptr)
+                : "memory", "v0");
+#else  // __aarch64__
+            asm volatile(
+                "vld1.u16   {d2}, [%0]!     \n"
+                "vcvt.f32.f16 q0, d2        \n"
+                "vst1.f32   {d0-d1}, [%1]!  \n"
+                : "=r"(ptr),   // %0
+                "=r"(outptr) // %1
+                : "0"(ptr),
+                "1"(outptr)
+                : "memory", "q0", "q1");
+#endif // __aarch64__
         }
-#endif
+#endif // (__ARM_FP & 2)
         for (; i < size; i++)
         {
             *outptr++ = float16_to_float32(*ptr++);

diff --git a/src/layer/arm/innerproduct_fp16s.h b/src/layer/arm/innerproduct_fp16s.h
@@ -253,10 +253,10 @@ static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
             float32x4_t _val = vld1q_f32(sptr);
             uint16x8_t _w01 = vld1q_u16(kptr);
             uint16x8_t _w23 = vld1q_u16(kptr + 8);
-            float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w01)));
-            float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w01)));
-            float32x4_t _w2 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w23)));
-            float32x4_t _w3 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w23)));
+            float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01)));
+            float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01)));
+            float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w23)));
+            float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w23)));
 #endif
 
 #if __aarch64__
@@ -281,7 +281,7 @@ static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
 #else
-            float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr)));
+            float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
 #endif
             _sum0 = vfmaq_f32(_sum0, _val, _w);
 
@@ -410,10 +410,10 @@ static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const
             float32x4_t _w3 = vcvt_f32_f16(vld1_f16(kptr3));
 #else
             float32x4_t _val = vld1q_f32(sptr);
-            float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr0)));
-            float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr1)));
-            float32x4_t _w2 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr2)));
-            float32x4_t _w3 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr3)));
+            float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr0)));
+            float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr1)));
+            float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr2)));
+            float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr3)));
 #endif
 
             _sum0 = vfmaq_f32(_sum0, _val, _w0);
@@ -507,7 +507,7 @@ static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const
             float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
 #else
             float32x4_t _val = vld1q_f32(sptr);
-            float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr)));
+            float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
 #endif
             _sum = vfmaq_f32(_sum, _val, _w);
 
@@ -713,10 +713,10 @@ static void innerproduct_transform_kernel_fp16s_neon(const Mat& weight_data, Mat
             {
                 // transpose 4x4
                 uint16x4x4_t _p;
-                _p.val[0] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k0)));
-                _p.val[1] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k1)));
-                _p.val[2] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k2)));
-                _p.val[3] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k3)));
+                _p.val[0] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k0)));
+                _p.val[1] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k1)));
+                _p.val[2] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k2)));
+                _p.val[3] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k3)));
                 vst4_u16(g0, _p);
 
                 k0 += 4;

diff --git a/src/layer/arm/innerproduct_gemm_fp16s.h b/src/layer/arm/innerproduct_gemm_fp16s.h
@@ -120,7 +120,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
                     float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
 #else
                     float32x4_t _val = vld1q_f32(m);
-                    float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr)));
+                    float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
 #endif
 
 #if __aarch64__
@@ -214,10 +214,10 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
                     float32x4_t _val = vld1q_f32(m);
                     uint16x8_t _w01 = vld1q_u16(kptr);
                     uint16x8_t _w23 = vld1q_u16(kptr + 8);
-                    float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w01)));
-                    float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w01)));
-                    float32x4_t _w2 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w23)));
-                    float32x4_t _w3 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w23)));
+                    float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01)));
+                    float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01)));
+                    float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w23)));
+                    float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w23)));
 #endif
 
 #if __aarch64__
@@ -242,7 +242,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                     float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
 #else
-                    float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr)));
+                    float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
 #endif
                     _sum0 = vfmaq_f32(_sum0, _val, _w);
 
@@ -317,7 +317,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
                     float32x4_t _val1 = vld1q_f32(m + 4);
                     float32x4_t _val2 = vld1q_f32(m + 8);
                     float32x4_t _val3 = vld1q_f32(m + 12);
-                    float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr)));
+                    float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
 #endif
 
 #if __aarch64__
@@ -414,8 +414,8 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
                     float32x4_t _val0 = vld1q_f32(m);
                     float32x4_t _val1 = vld1q_f32(m + 4);
                     uint16x8_t _w01 = vld1q_u16(kptr);
-                    float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w01)));
-                    float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w01)));
+                    float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01)));
+                    float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01)));
 #endif
 
                     _sum0 = vfmaq_f32(_sum0, _val0, _w0);
@@ -433,7 +433,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob,
                     float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
 #else
                     float32x4_t _val = vld1q_f32(m);
-                    float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr)));
+                    float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
 #endif
 
                     _sum0 = vfmaq_f32(_sum0, _val, _w);