From 242dd3cad870622cdaf556da8088673a7a33e791 Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Thu, 3 Aug 2023 22:59:38 +0800
Subject: [PATCH 01/23] Add draft code for op remainder on x86

---
 src/layer/x86/remainder_x86.cpp | 159 ++++++++++++++++++++++++++++++++
 src/layer/x86/remainder_x86.h   |  18 ++++
 2 files changed, 177 insertions(+)
 create mode 100644 src/layer/x86/remainder_x86.cpp
 create mode 100644 src/layer/x86/remainder_x86.h
diff --git a/src/layer/x86/remainder_x86.cpp b/src/layer/x86/remainder_x86.cpp
new file mode 100644
index 00000000000..6cd5cb08be0
--- /dev/null
+++ b/src/layer/x86/remainder_x86.cpp
@@ -0,0 +1,159 @@
+#include "remainder_x86.h"
+
+#if __SSE2__
+#include <emmintrin.h>
+#if __AVX__
+#include <immintrin.h>
+#endif // __AVX__
+#endif // __SSE2__
+#include "x86_usability.h"
+
+namespace ncnn {
+
+Remainder_x86::Remainder_x86()
+{
+#if __SSE2__
+    support_packing = true;
+#endif // __SSE2__
+}
+
+int Remainder_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int d = bottom_blob.d;
+    int channels = bottom_blob.c;
+    int elempack = bottom_blob.elempack;
+    int size = w * h * d * elempack;
+
+    Mat& top_blob = top_blobs[0];
+    top_blob.create_like(bottom_blob, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    // first blob
+    const Mat& bottom_blob1 = bottom_blobs[1];
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        const float* ptr = bottom_blob.channel(q);
+        const float* ptr1 = bottom_blob1.channel(q);
+        float* outptr = top_blob.channel(q);
+
+        int i = 0;
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+        for (; i + 15 < size; i += 16)
+        {
+            __m512 _p = _mm512_loadu_ps(ptr);
+            __m512 _p1 = _mm512_loadu_ps(ptr1);
+            // TODO: Instruction for remainder
+            // _p = xxxxx(_p, _p1);
+            // _mm512_storeu_ps(outptr, _p);
+
+            ptr += 16;
+            ptr1 += 16;
+            outptr += 16;
+        }
+#endif // __AVX512F__
+        for (; i + 7 < size; i += 8)
+        {
+            __m256 _p = _mm256_loadu_ps(ptr);
+            __m256 _p1 = _mm256_loadu_ps(ptr1);
+            // TODO: Instruction for remainder
+            // _p = xxxxx(_p, _p1);
+            // _mm256_storeu_ps(outptr, _p);
+
+            ptr += 8;
+            ptr1 += 8;
+            outptr += 8;
+        }
+#endif // __AVX__
+        for (; i + 3 < size; i += 4)
+        {
+            __m128 _p = _mm_load_ps(ptr);
+            __m128 _p1 = _mm_load_ps(ptr1);
+            // TODO: Instruction for remainder
+            // _p = xxxxx(_p, _p1);
+            // _mm_store_ps(outptr, _p);
+
+            ptr += 4;
+            ptr1 += 4;
+            outptr += 4;
+        }
+#endif // __SSE2__
+        for (; i < size; i++)
+        {
+            *outptr = remainderf(*ptr, *ptr1);
+
+            ptr++;
+            ptr1++;
+            outptr++;
+        }
+    }
+
+    for (size_t b = 2; b < bottom_blobs.size(); b++)
+    {
+        const Mat& bottom_blob2 = bottom_blobs[b];
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const float* ptr = bottom_blob2.channel(q);
+            float* outptr = top_blob.channel(q);
+
+            int i = 0;
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+            for (; i + 15 < size; i += 16)
+            {
+                __m512 _p = _mm512_loadu_ps(outptr);
+                __m512 _p1 = _mm512_loadu_ps(ptr);
+                // TODO: Instruction for remainder
+                // _p = xxxxx(_p, _p1);
+                // _mm512_storeu_ps(outptr, _p);
+
+                ptr += 16;
+                outptr += 16;
+            }
+#endif // __AVX512F__
+            for (; i + 7 < size; i += 8)
+            {
+                __m256 _p = _mm256_loadu_ps(outptr);
+                __m256 _p1 = _mm256_loadu_ps(ptr);
+                // TODO: Instruction for remainder
+                // _p = xxxxx(_p, _p1);
+                // _mm256_storeu_ps(outptr, _p);
+
+                ptr += 8;
+                outptr += 8;
+            }
+#endif // __AVX__
+            for (; i + 3 < size; i += 4)
+            {
+                __m128 _p = _mm_load_ps(outptr);
+                __m128 _p1 = _mm_load_ps(ptr);
+                // TODO: Instruction for remainder
+                // _p = xxxxx(_p, _p1);
+                // _mm_store_ps(outptr, _p);
+
+                ptr += 4;
+                outptr += 4;
+            }
+#endif // __SSE2__
+            for (; i < size; i++)
+            {
+                *outptr = remainderf(*outptr, *ptr);
+
+                ptr++;
+                outptr++;
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/x86/remainder_x86.h b/src/layer/x86/remainder_x86.h
new file mode 100644
index 00000000000..2a260eccf4f
--- /dev/null
+++ b/src/layer/x86/remainder_x86.h
@@ -0,0 +1,18 @@
+#ifndef LAYER_ELTWISE_X86_H
+#define LAYER_ELTWISE_X86_H
+
+#include "eltwise.h"
+
+namespace ncnn {
+
+class Remainder_x86 : virtual public Eltwise
+{
+public:
+    Remainder_x86();
+
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_ELTWISE_X86_H

From 3d77066d86169926952cad11868f099cf5879979 Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Fri, 4 Aug 2023 23:10:50 +0800
Subject: [PATCH 02/23] Remove old remainder_x86

---
 src/layer/x86/remainder_x86.cpp | 159 --------------------------------
 src/layer/x86/remainder_x86.h   |  18 ----
 2 files changed, 177 deletions(-)
 delete mode 100644 src/layer/x86/remainder_x86.cpp
 delete mode 100644 src/layer/x86/remainder_x86.h

diff --git a/src/layer/x86/remainder_x86.cpp b/src/layer/x86/remainder_x86.cpp
deleted file mode 100644
index 6cd5cb08be0..00000000000
--- a/src/layer/x86/remainder_x86.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-#include "remainder_x86.h"
-
-#if __SSE2__
-#include <emmintrin.h>
-#if __AVX__
-#include <immintrin.h>
-#endif // __AVX__
-#endif // __SSE2__
-#include "x86_usability.h"
-
-namespace ncnn {
-
-Remainder_x86::Remainder_x86()
-{
-#if __SSE2__
-    support_packing = true;
-#endif // __SSE2__
-}
-
-int Remainder_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
-{
-    const Mat& bottom_blob = bottom_blobs[0];
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int d = bottom_blob.d;
-    int channels = bottom_blob.c;
-    int elempack = bottom_blob.elempack;
-    int size = w * h * d * elempack;
-
-    Mat& top_blob = top_blobs[0];
-    top_blob.create_like(bottom_blob, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
-
-    // first blob
-    const Mat& bottom_blob1 = bottom_blobs[1];
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int q = 0; q < channels; q++)
-    {
-        const float* ptr = bottom_blob.channel(q);
-        const float* ptr1 = bottom_blob1.channel(q);
-        float* outptr = top_blob.channel(q);
-
-        int i = 0;
-#if __SSE2__
-#if __AVX__
-#if __AVX512F__
-        for (; i + 15 < size; i += 16)
-        {
-            __m512 _p = _mm512_loadu_ps(ptr);
-            __m512 _p1 = _mm512_loadu_ps(ptr1);
-            // TODO: Instruction for remainder
-            // _p = xxxxx(_p, _p1);
-            // _mm512_storeu_ps(outptr, _p);
-
-            ptr += 16;
-            ptr1 += 16;
-            outptr += 16;
-        }
-#endif // __AVX512F__
-        for (; i + 7 < size; i += 8)
-        {
-            __m256 _p = _mm256_loadu_ps(ptr);
-            __m256 _p1 = _mm256_loadu_ps(ptr1);
-            // TODO: Instruction for remainder
-            // _p = xxxxx(_p, _p1);
-            // _mm256_storeu_ps(outptr, _p);
-
-            ptr += 8;
-            ptr1 += 8;
-            outptr += 8;
-        }
-#endif // __AVX__
-        for (; i + 3 < size; i += 4)
-        {
-            __m128 _p = _mm_load_ps(ptr);
-            __m128 _p1 = _mm_load_ps(ptr1);
-            // TODO: Instruction for remainder
-            // _p = xxxxx(_p, _p1);
-            // _mm_store_ps(outptr, _p);
-
-            ptr += 4;
-            ptr1 += 4;
-            outptr += 4;
-        }
-#endif // __SSE2__
-        for (; i < size; i++)
-        {
-            *outptr = remainderf(*ptr, *ptr1);
-
-            ptr++;
-            ptr1++;
-            outptr++;
-        }
-    }
-
-    for (size_t b = 2; b < bottom_blobs.size(); b++)
-    {
-        const Mat& bottom_blob2 = bottom_blobs[b];
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < channels; q++)
-        {
-            const float* ptr = bottom_blob2.channel(q);
-            float* outptr = top_blob.channel(q);
-
-            int i = 0;
-#if __SSE2__
-#if __AVX__
-#if __AVX512F__
-            for (; i + 15 < size; i += 16)
-            {
-                __m512 _p = _mm512_loadu_ps(outptr);
-                __m512 _p1 = _mm512_loadu_ps(ptr);
-                // TODO: Instruction for remainder
-                // _p = xxxxx(_p, _p1);
-                // _mm512_storeu_ps(outptr, _p);
-
-                ptr += 16;
-                outptr += 16;
-            }
-#endif // __AVX512F__
-            for (; i + 7 < size; i += 8)
-            {
-                __m256 _p = _mm256_loadu_ps(outptr);
-                __m256 _p1 = _mm256_loadu_ps(ptr);
-                // TODO: Instruction for remainder
-                // _p = xxxxx(_p, _p1);
-                // _mm256_storeu_ps(outptr, _p);
-
-                ptr += 8;
-                outptr += 8;
-            }
-#endif // __AVX__
-            for (; i + 3 < size; i += 4)
-            {
-                __m128 _p = _mm_load_ps(outptr);
-                __m128 _p1 = _mm_load_ps(ptr);
-                // TODO: Instruction for remainder
-                // _p = xxxxx(_p, _p1);
-                // _mm_store_ps(outptr, _p);
-
-                ptr += 4;
-                outptr += 4;
-            }
-#endif // __SSE2__
-            for (; i < size; i++)
-            {
-                *outptr = remainderf(*outptr, *ptr);
-
-                ptr++;
-                outptr++;
-            }
-        }
-    }
-
-    return 0;
-}
-
-} // namespace ncnn
diff --git a/src/layer/x86/remainder_x86.h b/src/layer/x86/remainder_x86.h
deleted file mode 100644
index 2a260eccf4f..00000000000
--- a/src/layer/x86/remainder_x86.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef LAYER_ELTWISE_X86_H
-#define LAYER_ELTWISE_X86_H
-
-#include "eltwise.h"
-
-namespace ncnn {
-
-class Remainder_x86 : virtual public Eltwise
-{
-public:
-    Remainder_x86();
-
-    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
-};
-
-} // namespace ncnn
-
-#endif // LAYER_ELTWISE_X86_H

From 173d199f2a385774242b225fefc6a0253d72a4b2 Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Fri, 4 Aug 2023 23:11:10 +0800
Subject: [PATCH 03/23] Refactor remainder_x86 to binary op

---
 src/layer/binaryop.cpp         | 10 ++++++++++
 src/layer/binaryop.h           |  3 ++-
 src/layer/x86/avx512_mathfun.h |  7 +++++++
 src/layer/x86/avx_mathfun.h    |  8 ++++++++
 src/layer/x86/binaryop_x86.cpp | 27 +++++++++++++++++++++++++++
 src/layer/x86/sse_mathfun.h    | 10 ++++++++++
 6 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/src/layer/binaryop.cpp b/src/layer/binaryop.cpp
index 442d43b500c..80d0588c759 100644
--- a/src/layer/binaryop.cpp
+++ b/src/layer/binaryop.cpp
@@ -239,6 +239,14 @@ struct binary_op_ratan2
     }
 };
 
+struct binary_op_remainder
+{
+    float operator()(const float& x, const float& y) const
+    {
+        return (float)remainderf(x, y);
+    }
+};
+
 static void binary_op_broadcast(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt)
 {
     if (op_type == BinaryOp::Operation_ADD) return binary_op_broadcast<binary_op_add>(a, b, c, opt);
@@ -253,6 +261,7 @@ static void binary_op_broadcast(const Mat& a, const Mat& b, Mat& c, int op_type,
     if (op_type == BinaryOp::Operation_RPOW) return binary_op_broadcast<binary_op_pow>(b, a, c, opt);
     if (op_type == BinaryOp::Operation_ATAN2) return binary_op_broadcast<binary_op_atan2>(a, b, c, opt);
     if (op_type == BinaryOp::Operation_RATAN2) return binary_op_broadcast<binary_op_atan2>(b, a, c, opt);
+    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_broadcast<binary_op_remainder>(b, a, c, opt);
 
     // should never reach here
 }
@@ -271,6 +280,7 @@ static void binary_op_scalar_inplace(Mat& bottom_top_blob, float b, int op_type,
     if (op_type == BinaryOp::Operation_RPOW) return binary_op_scalar_inplace<binary_op_rpow>(bottom_top_blob, b, opt);
     if (op_type == BinaryOp::Operation_ATAN2) return binary_op_scalar_inplace<binary_op_atan2>(bottom_top_blob, b, opt);
     if (op_type == BinaryOp::Operation_RATAN2) return binary_op_scalar_inplace<binary_op_ratan2>(bottom_top_blob, b, opt);
+    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_scalar_inplace<binary_op_remainder>(bottom_top_blob, b, opt);
 
     // should never reach here
 }
diff --git a/src/layer/binaryop.h b/src/layer/binaryop.h
index 5fc06918d20..f22d970be6c 100644
--- a/src/layer/binaryop.h
+++ b/src/layer/binaryop.h
@@ -45,7 +45,8 @@ class BinaryOp : public Layer
         Operation_RDIV = 8,
         Operation_RPOW = 9,
         Operation_ATAN2 = 10,
-        Operation_RATAN2 = 11
+        Operation_RATAN2 = 11,
+        Operation_REMAINDER = 12
     };
 
 public:
diff --git a/src/layer/x86/avx512_mathfun.h b/src/layer/x86/avx512_mathfun.h
index b5e47bdbe68..725643d9e2d 100644
--- a/src/layer/x86/avx512_mathfun.h
+++ b/src/layer/x86/avx512_mathfun.h
@@ -856,4 +856,11 @@ static NCNN_FORCEINLINE __m512 abs512_ps(__m512 x)
     return _mm512_andnot_ps(magic_negative_zero, x);
 }
 
+static NCNN_FORCEINLINE __m512 remainder512_ps(__m512 x, __m512 y)
+{
+    const __m512 round_div_result = _mm512_div_round_ps(x, y, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
+    const __m512 mul_result = _mm512_mul_ps(round_div_result, y);
+    return _mm512_sub_ps(x, mul_result);
+}
+
 #endif // AVX512_MATHFUN_H
diff --git a/src/layer/x86/avx_mathfun.h b/src/layer/x86/avx_mathfun.h
index 65c34efc23e..24788cdf7a1 100644
--- a/src/layer/x86/avx_mathfun.h
+++ b/src/layer/x86/avx_mathfun.h
@@ -1087,4 +1087,12 @@ static NCNN_FORCEINLINE __m256 abs256_ps(__m256 x)
     return _mm256_andnot_ps(magic_negative_zero, x);
 }
 
+static NCNN_FORCEINLINE __m256 remainder256_ps(__m256 x, __m256 y)
+{
+    const __m256 div_result = _mm256_div_ps(x, y);
+    const __m256 round_result = _mm256_round_ps(div_result, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
+    const __m256 mul_result = _mm256_mul_ps(round_result, y);
+    return _mm256_sub_ps(x, mul_result);
+}
+
 #endif // AVX_MATHFUN_H
diff --git a/src/layer/x86/binaryop_x86.cpp b/src/layer/x86/binaryop_x86.cpp
index 328b0484a3f..5123767b23a 100644
--- a/src/layer/x86/binaryop_x86.cpp
+++ b/src/layer/x86/binaryop_x86.cpp
@@ -791,6 +791,32 @@ struct binary_op_ratan2
 #endif // __SSE2__
 };
 
+struct binary_op_remainder
+{
+    float func(const float& x, const float& y) const
+    {
+        return (float)remainderf(x, y);
+    }
+#if __SSE2__
+    __m128 func_pack4(const __m128& x, const __m128& y) const
+    {
+        return remainder_ps(x, y);
+    }
+#if __AVX__
+    __m256 func_pack8(const __m256& x, const __m256& y) const
+    {
+        return remainder256_ps(x, y);
+    }
+#if __AVX512F__
+    __m512 func_pack16(const __m512& x, const __m512& y) const
+    {
+        return remainder512_ps(x, y);
+    }
+#endif // __AVX512F__
+#endif // __AVX__
+#endif // __SSE2__
+};
+
 } // namespace BinaryOp_x86_functor
 
 static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr, int aw, int bw, int ap, int bp, int op_type)
@@ -809,6 +835,7 @@ static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr,
     if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector<binary_op_rpow>(ptr, ptr1, outptr, aw, bw, ap, bp);
     if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector<binary_op_atan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
     if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector<binary_op_ratan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
+    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector<binary_op_remainder>(ptr, ptr1, outptr, aw, bw, ap, bp);
 
     // should never reach here
 }
diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h
index b7cecfb8123..72727e407d7 100644
--- a/src/layer/x86/sse_mathfun.h
+++ b/src/layer/x86/sse_mathfun.h
@@ -32,6 +32,8 @@
 #ifndef SSE_MATHFUN_H
 #define SSE_MATHFUN_H
 
+#include <immintrin.h>
+#include <smmintrin.h>
 #define USE_SSE2 1
 
 #include <xmmintrin.h>
@@ -1157,4 +1159,12 @@ static NCNN_FORCEINLINE __m128 abs_ps(__m128 inputs)
     return _mm_andnot_ps(magic_negative_zero, inputs);
 }
 
+static NCNN_FORCEINLINE __m128 remainder_ps(__m128 x, __m128 y)
+{
+    const __m128 div_result = _mm_div_ps(x, y);
+    const __m128 round_result = _mm_round_ps(div_result, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
+    const __m128 mul_result = _mm_mul_ps(round_result, y);
+    return _mm_sub_ps(x, mul_result);
+}
+
 #endif // SSE_MATHFUN_H

From 460b4d41277c006beb638e7a6927a4859397b511 Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Fri, 4 Aug 2023 23:13:31 +0800
Subject: [PATCH 04/23] Remove headers

---
 src/layer/x86/sse_mathfun.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h
index 72727e407d7..b9352fec6c4 100644
--- a/src/layer/x86/sse_mathfun.h
+++ b/src/layer/x86/sse_mathfun.h
@@ -32,8 +32,6 @@
 #ifndef SSE_MATHFUN_H
 #define SSE_MATHFUN_H
 
-#include <immintrin.h>
-#include <smmintrin.h>
 #define USE_SSE2 1
 
 #include <xmmintrin.h>

From 5ab6be7036028594896f6826f96adb24cf9d13dc Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Tue, 8 Aug 2023 22:16:40 +0800
Subject: [PATCH 05/23] Use sse4

---
 src/CMakeLists.txt          | 2 +-
 src/layer/x86/sse_mathfun.h | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cb7be1f6e9c..07496791a96 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -345,7 +345,7 @@ if(NCNN_TARGET_ARCH STREQUAL "x86")
         if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
             target_compile_options(ncnn PRIVATE /arch:SSE2 /D__SSE2__)
         else()
-            target_compile_options(ncnn PRIVATE -msse2 -msse)
+            target_compile_options(ncnn PRIVATE -msse4.1 -msse2 -msse)
             if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
                 target_compile_options(ncnn PRIVATE -msimd128)
             endif()
diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h
index b9352fec6c4..cc34d7f7989 100644
--- a/src/layer/x86/sse_mathfun.h
+++ b/src/layer/x86/sse_mathfun.h
@@ -33,6 +33,7 @@
 #define SSE_MATHFUN_H
 
 #define USE_SSE2 1
+#define USE_SSE4 1
 
 #include <xmmintrin.h>
 #include <x86_usability.h>
@@ -57,6 +58,10 @@ typedef __m128i v4si; // vector of 4 int (sse2)
 typedef __m64 v2si; // vector of 2 int (mmx)
 #endif
 
+#ifdef USE_SSE4
+#include <smmintrin.h>
+#endif
+
 /* declare some SSE constants -- why can't I figure a better way to do that? */
 #define _PS_CONST(Name, Val) \
     static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = {Val, Val, Val, Val}

From a8d4d97829f2f6979a6f6deba07288ffe3f6052a Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Tue, 8 Aug 2023 22:18:48 +0800
Subject: [PATCH 06/23] Try support remainder in vulkan

---
 src/layer/vulkan/shader/binaryop.comp                    | 1 +
 src/layer/vulkan/shader/binaryop_broadcast.comp          | 1 +
 src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp | 1 +
 src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp | 5 +++++
 src/layer/vulkan/shader/binaryop_broadcast_pack4.comp    | 1 +
 src/layer/vulkan/shader/binaryop_broadcast_pack8.comp    | 5 +++++
 src/layer/vulkan/shader/binaryop_pack4.comp              | 1 +
 src/layer/vulkan/shader/binaryop_pack8.comp              | 5 +++++
 8 files changed, 20 insertions(+)

diff --git a/src/layer/vulkan/shader/binaryop.comp b/src/layer/vulkan/shader/binaryop.comp
index 18f566a2a72..02c986817a3 100644
--- a/src/layer/vulkan/shader/binaryop.comp
+++ b/src/layer/vulkan/shader/binaryop.comp
@@ -137,6 +137,7 @@ void main()
     if (op_type == 10) res = atan(v1, v2);
     if (op_type == 11) res = atan(v2, v1);
 #endif
+    if (op_type == 12) res = mod(v1, v2);
 
 #if NCNN_image_shader
     image3d_st1(top_blob_3d, ivec3(gx, gy, gz), res);
diff --git a/src/layer/vulkan/shader/binaryop_broadcast.comp b/src/layer/vulkan/shader/binaryop_broadcast.comp
index 732e3f50b0a..d15d8e855ca 100644
--- a/src/layer/vulkan/shader/binaryop_broadcast.comp
+++ b/src/layer/vulkan/shader/binaryop_broadcast.comp
@@ -199,6 +199,7 @@ void main()
     if (op_type == 10) res = atan(v1, v2);
     if (op_type == 11) res = atan(v2, v1);
 #endif
+    if (op_type == 12) res = mod(v1, v2);
 
 #if NCNN_image_shader
     image3d_st1(top_blob_3d, ivec3(gx, gy, gz), res);
diff --git a/src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp b/src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp
index ced3933db4a..6279153b800 100644
--- a/src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp
+++ b/src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp
@@ -130,6 +130,7 @@ void main()
     if (op_type == 10) res = atan(v1, v2);
     if (op_type == 11) res = atan(v2, v1);
 #endif
+    if (op_type == 12) res = mod(v1, v2);
 
 #if NCNN_image_shader
     image3d_st4(top_blob_3d, ivec3(gx, gy, gz), res);
diff --git a/src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp b/src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp
index 963f9c0030c..9aa70946404 100644
--- a/src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp
+++ b/src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp
@@ -187,6 +187,11 @@ void main()
         res[1] = atan(v2[1], v1[1]);
 #endif
     }
+    if (op_type == 12)
+    {
+        res[0] = mod(v1[0], v2[0]);
+        res[1] = mod(v1[1], v2[1]);
+    }
 
 #if NCNN_image_shader
     image3d_st8(top_blob_3d, ivec3(gx, gy, gz), res);
diff --git a/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp b/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp
index a0f0376b09e..bcba406a748 100644
--- a/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp
+++ b/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp
@@ -199,6 +199,7 @@ void main()
     if (op_type == 10) res = atan(v1, v2);
     if (op_type == 11) res = atan(v2, v1);
 #endif
+    if (op_type == 12) res = mod(v1, v2);
 
 #if NCNN_image_shader
     image3d_st4(top_blob_3d, ivec3(gx, gy, gz), res);
diff --git a/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp b/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp
index b9e7d492bb9..f5cbd166b34 100644
--- a/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp
+++ b/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp
@@ -253,6 +253,11 @@ void main()
         res[1] = atan(v2[1], v1[1]);
 #endif
     }
+    if (op_type == 12)
+    {
+        res[0] = mod(v1[0], v2[0]);
+        res[1] = mod(v1[1], v2[1]);
+    }
 
 #if NCNN_image_shader
     image3d_st8(top_blob_3d, ivec3(gx, gy, gz), res);
diff --git a/src/layer/vulkan/shader/binaryop_pack4.comp b/src/layer/vulkan/shader/binaryop_pack4.comp
index 0189253fb3d..a1df2479aeb 100644
--- a/src/layer/vulkan/shader/binaryop_pack4.comp
+++ b/src/layer/vulkan/shader/binaryop_pack4.comp
@@ -128,6 +128,7 @@ void main()
     if (op_type == 10) res = atan(v1, v2);
     if (op_type == 11) res = atan(v2, v1);
 #endif
+    if (op_type == 12) res = mod(v1, v2);
 
 #if NCNN_image_shader
     image3d_st4(top_blob_3d, ivec3(gx, gy, gz), res);
diff --git a/src/layer/vulkan/shader/binaryop_pack8.comp b/src/layer/vulkan/shader/binaryop_pack8.comp
index 9fe54902bd5..924a01f2534 100644
--- a/src/layer/vulkan/shader/binaryop_pack8.comp
+++ b/src/layer/vulkan/shader/binaryop_pack8.comp
@@ -183,6 +183,11 @@ void main()
         res[1] = atan(v2[1], v1[1]);
 #endif
     }
+    if (op_type == 12)
+    {
+        res[0] = mod(v1[0], v2[0]);
+        res[1] = mod(v1[1], v2[1]);
+    }
 
 #if NCNN_image_shader
     image3d_st8(top_blob_3d, ivec3(gx, gy, gz), res);

From 184e4efcbaf100a83e43f5fb03eb20e6b4d151ff Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Tue, 8 Aug 2023 22:19:09 +0800
Subject: [PATCH 07/23] Try support remainder in riscv

---
 src/layer/riscv/binaryop_riscv.cpp  |  3 +++
 src/layer/riscv/rvv_mathfun.h       | 19 +++++++++++++++++++
 src/layer/riscv/rvv_mathfun_fp16s.h | 19 +++++++++++++++++++
 3 files changed, 41 insertions(+)

diff --git a/src/layer/riscv/binaryop_riscv.cpp b/src/layer/riscv/binaryop_riscv.cpp
index 67ff7ce99f1..31a6af6654d 100644
--- a/src/layer/riscv/binaryop_riscv.cpp
+++ b/src/layer/riscv/binaryop_riscv.cpp
@@ -295,6 +295,7 @@ MAKE_FUNCTION(binary_op_rdiv, y / x, vfdiv_vv_f32m8(y, x, vl), vfrdiv_vf_f32m8(x
 MAKE_FUNCTION(binary_op_rpow, (float)pow(y, x), pow_ps(y, x, vl), pow_ps(vfmv_v_f_f32m8(y, vl), x, vl), pow_ps(y, vfmv_v_f_f32m8(x, vl), vl))
 MAKE_FUNCTION(binary_op_atan2, (float)atan2(x, y), atan2_ps(x, y, vl), atan2_ps(x, vfmv_v_f_f32m8(y, vl), vl), atan2_ps(vfmv_v_f_f32m8(x, vl), y, vl))
 MAKE_FUNCTION(binary_op_ratan2, (float)atan2(y, x), atan2_ps(y, x, vl), atan2_ps(vfmv_v_f_f32m8(y, vl), x, vl), atan2_ps(y, vfmv_v_f_f32m8(x, vl), vl))
+MAKE_FUNCTION(binary_op_remainder, (float)remainderf(x, y), remainder_ps(x, y, vl), remainder_ps(x, vfmv_v_f_f32m8(y, vl), vl), remainder_ps(vfmv_v_f_f32m8(x, vl), y, vl))
 // *INDENT-ON*
 // clang-format on
 
@@ -318,6 +319,7 @@ static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr,
     if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector<binary_op_rpow>(ptr, ptr1, outptr, aw, bw, ap, bp);
     if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector<binary_op_atan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
     if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector<binary_op_ratan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
+    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector<binary_op_remainder>(ptr, ptr1, outptr, aw, bw, ap, bp);
 
     // should never reach here
 }
@@ -846,6 +848,7 @@ static void binary_op_vector_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16
     if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector_fp16s<binary_op_rpow_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
     if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector_fp16s<binary_op_atan2_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
     if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector_fp16s<binary_op_ratan2_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
+    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector_fp16s<binary_op_remainder_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
 
     // should never reach here
 }
diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h
index ebf980060a7..02b6a7417db 100644
--- a/src/layer/riscv/rvv_mathfun.h
+++ b/src/layer/riscv/rvv_mathfun.h
@@ -566,4 +566,23 @@ _RVV_FLOAT32_ATAN2_OP(2, 16)
 _RVV_FLOAT32_ATAN2_OP(4, 8)
 _RVV_FLOAT32_ATAN2_OP(8, 4)
 
+#define _RVV_FLOAT32_REMAINDER_OP(LMUL, MLEN)                                                               \
+    static inline vfloat32m##LMUL##_t remainder_ps(vfloat32m##LMUL##_t x, vfloat32m##LMUL##_t y, size_t vl) \
+    {                                                                                                       \
+        std::vector<float> tmpx(vl);                                                                        \
+        std::vector<float> tmpy(vl);                                                                        \
+        vse32_v_f32m##LMUL(tmpx.data(), x, vl);                                                             \
+        vse32_v_f32m##LMUL(tmpy.data(), y, vl);                                                             \
+        for (size_t i = 0; i < vl; i++)                                                                     \
+        {                                                                                                   \
+            tmpx[i] = remainderf(tmpx[i], tmpy[i]);                                                         \
+        }                                                                                                   \
+        return vle32_v_f32m##LMUL(tmpx.data(), vl);                                                         \
+    }
+
+_RVV_FLOAT32_REMAINDER_OP(1, 32)
+_RVV_FLOAT32_REMAINDER_OP(2, 16)
+_RVV_FLOAT32_REMAINDER_OP(4, 8)
+_RVV_FLOAT32_REMAINDER_OP(8, 4)
+
 #endif // RVV_MATHFUN_H
diff --git a/src/layer/riscv/rvv_mathfun_fp16s.h b/src/layer/riscv/rvv_mathfun_fp16s.h
index 47671fe21f0..5280ba584c9 100644
--- a/src/layer/riscv/rvv_mathfun_fp16s.h
+++ b/src/layer/riscv/rvv_mathfun_fp16s.h
@@ -402,4 +402,23 @@ _RVV_FLOAT16_ATAN2_OP(2, 16)
 _RVV_FLOAT16_ATAN2_OP(4, 8)
 _RVV_FLOAT16_ATAN2_OP(8, 4)
 
+#define _RVV_FLOAT16_REMAINDER_OP(LMUL, MLEN)                                                               \
+    static inline vfloat16m##LMUL##_t remainder_ps(vfloat16m##LMUL##_t x, vfloat16m##LMUL##_t y, size_t vl) \
+    {                                                                                                       \
+        std::vector<__fp16> tmpx(vl);                                                                       \
+        std::vector<__fp16> tmpy(vl);                                                                       \
+        vse16_v_f16m##LMUL(tmpx.data(), x, vl);                                                             \
+        vse16_v_f16m##LMUL(tmpy.data(), y, vl);                                                             \
+        for (size_t i = 0; i < vl; i++)                                                                     \
+        {                                                                                                   \
+            tmpx[i] = (__fp16)remainderf((float)tmpx[i], (float)tmpy[i]);                                   \
+        }                                                                                                   \
+        return vle16_v_f16m##LMUL(tmpx.data(), vl);                                                         \
+    }
+
+_RVV_FLOAT16_REMAINDER_OP(1, 32)
+_RVV_FLOAT16_REMAINDER_OP(2, 16)
+_RVV_FLOAT16_REMAINDER_OP(4, 8)
+_RVV_FLOAT16_REMAINDER_OP(8, 4)
+
 #endif // RVV_MATHFUN_FP16S_H

From a10197b1085891fab9fe5272c9564dc0189d7ab0 Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Tue, 29 Aug 2023 11:23:41 +0800
Subject: [PATCH 08/23] Try support remainder in mips

---
 src/layer/mips/binaryop_mips.cpp |  2 ++
 src/layer/mips/msa_mathfun.h     | 13 +++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/src/layer/mips/binaryop_mips.cpp b/src/layer/mips/binaryop_mips.cpp
index ebce7743708..d10641a3785 100644
--- a/src/layer/mips/binaryop_mips.cpp
+++ b/src/layer/mips/binaryop_mips.cpp
@@ -314,6 +314,7 @@ MAKE_FUNCTION(binary_op_rdiv, y / x, __msa_fdiv_w(y, x))
 MAKE_FUNCTION(binary_op_rpow, (float)pow(y, x), pow_ps(y, x))
 MAKE_FUNCTION(binary_op_atan2, (float)atan2(x, y), atan2_ps(x, y))
 MAKE_FUNCTION(binary_op_ratan2, (float)atan2(y, x), atan2_ps(y, x))
+MAKE_FUNCTION(binary_op_remainder, remainder(x, y), remainder_ps(x, y))
 // *INDENT-ON*
 // clang-format on
 
@@ -337,6 +338,7 @@ static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr,
     if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector<binary_op_rpow>(ptr, ptr1, outptr, aw, bw, ap, bp);
     if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector<binary_op_atan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
     if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector<binary_op_ratan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
+    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector<binary_op_remainder>(ptr, ptr1, outptr, aw, bw, ap, bp);
 
     // should never reach here
 }
diff --git a/src/layer/mips/msa_mathfun.h b/src/layer/mips/msa_mathfun.h
index cab71acbc6b..a005b95564b 100644
--- a/src/layer/mips/msa_mathfun.h
+++ b/src/layer/mips/msa_mathfun.h
@@ -267,4 +267,17 @@ static inline v4f32 atan2_ps(v4f32 a, v4f32 b)
     return (v4f32)__msa_ld_w(tmpx, 0);
 }
 
+static inline v4f32 remainder_ps(v4f32 x, v4f32 y)
+{
+    float tmpx[4];
+    float tmpy[4];
+    __msa_st_w(x, tmpx, 0);
+    __msa_st_w(y, tmpy, 0);
+    tmpx[0] = remainder(tmpx[0], tmpy[0]);
+    tmpx[1] = remainder(tmpx[1], tmpy[1]);
+    tmpx[2] = remainder(tmpx[2], tmpy[2]);
+    tmpx[3] = remainder(tmpx[3], tmpy[3]);
+    return __msa_ld_w(tmpx, 0);
+}
+
 #endif // MSA_MATHFUN_H

From 42950e20765d749bc339266538f31e687b048da5 Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Tue, 29 Aug 2023 11:24:06 +0800
Subject: [PATCH 09/23] Try support remainder in loongarch

---
 src/layer/loongarch/binaryop_loongarch.cpp |  2 ++
 src/layer/loongarch/lsx_mathfun.h          | 13 +++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/src/layer/loongarch/binaryop_loongarch.cpp b/src/layer/loongarch/binaryop_loongarch.cpp
index c5f64c083cd..3d4f191dcda 100644
--- a/src/layer/loongarch/binaryop_loongarch.cpp
+++ b/src/layer/loongarch/binaryop_loongarch.cpp
@@ -314,6 +314,7 @@ MAKE_FUNCTION(binary_op_rdiv, y / x, __lsx_vfdiv_s(y, x))
 MAKE_FUNCTION(binary_op_rpow, (float)pow(y, x), pow_ps(y, x))
 MAKE_FUNCTION(binary_op_atan2, (float)atan2(x, y), atan2_ps(x, y))
 MAKE_FUNCTION(binary_op_ratan2, (float)atan2(y, x), atan2_ps(y, x))
+MAKE_FUNCTION(binary_op_remainder, remainder(x, y), remainder_ps(x, y))
 // *INDENT-ON*
 // clang-format on
 
@@ -337,6 +338,7 @@ static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr,
     if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector<binary_op_rpow>(ptr, ptr1, outptr, aw, bw, ap, bp);
     if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector<binary_op_atan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
     if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector<binary_op_ratan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
+    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector<binary_op_remainder>(ptr, ptr1, outptr, aw, bw, ap, bp);
 
     // should never reach here
 }
diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h
index 194f63bedc3..9a67473fcef 100644
--- a/src/layer/loongarch/lsx_mathfun.h
+++ b/src/layer/loongarch/lsx_mathfun.h
@@ -269,4 +269,17 @@ static inline __m128 atan2_ps(__m128 a, __m128 b)
     return (__m128)__lsx_vld(tmpx, 0);
 }
 
+static inline __m128 remainder_ps(__m128 x, __m128 y)
+{
+    float tmpx[4];
+    float tmpy[4];
+    __lsx_vst(x, tmpx, 0);
+    __lsx_vst(y, tmpy, 0);
+    tmpx[0] = remainder(tmpx[0], tmpy[0]);
+    tmpx[1] = remainder(tmpx[1], tmpy[1]);
+    tmpx[2] = remainder(tmpx[2], tmpy[2]);
+    tmpx[3] = remainder(tmpx[3], tmpy[3]);
+    return __lsx_vld(tmpx, 0);
+}
+
 #endif // LSX_MATHFUN_H

From 1fd5705ba86a258b25e67228f4869de922f8c4fe Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Tue, 29 Aug 2023 11:24:26 +0800
Subject: [PATCH 10/23] Try support remainder in arm

---
 src/layer/arm/binaryop_arm.cpp         |  2 ++
 src/layer/arm/binaryop_arm_asimdhp.cpp |  2 ++
 src/layer/arm/neon_mathfun.h           | 11 +++++++++++
 3 files changed, 15 insertions(+)

diff --git a/src/layer/arm/binaryop_arm.cpp b/src/layer/arm/binaryop_arm.cpp
index c0d2b9bbbb1..f70563fbb2b 100644
--- a/src/layer/arm/binaryop_arm.cpp
+++ b/src/layer/arm/binaryop_arm.cpp
@@ -287,6 +287,7 @@ MAKE_FUNCTION(binary_op_rdiv, y / x, div_ps(y, x))
 MAKE_FUNCTION(binary_op_rpow, (float)powf(y, x), pow_ps(y, x))
 MAKE_FUNCTION(binary_op_atan2, (float)atan2f(x, y), atan2_ps(x, y))
 MAKE_FUNCTION(binary_op_ratan2, (float)atan2f(y, x), atan2_ps(y, x))
+MAKE_FUNCTION(binary_op_remainder, remainderf(x, y), remainder_ps(x, y))
 // *INDENT-ON*
 // clang-format on
 
@@ -310,6 +311,7 @@ static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr,
     if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector<binary_op_rpow>(ptr, ptr1, outptr, aw, bw, ap, bp);
     if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector<binary_op_atan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
     if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector<binary_op_ratan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
+    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector<binary_op_remainder>(ptr, ptr1, outptr, aw, bw, ap, bp);
 
     // should never reach here
 }
diff --git a/src/layer/arm/binaryop_arm_asimdhp.cpp b/src/layer/arm/binaryop_arm_asimdhp.cpp
index 8dc0db288f4..95ccc8df52c 100644
--- a/src/layer/arm/binaryop_arm_asimdhp.cpp
+++ b/src/layer/arm/binaryop_arm_asimdhp.cpp
@@ -330,6 +330,7 @@ MAKE_FUNCTION(binary_op_rdiv_fp16s, y / x, vdiv_f16(y, x), vdivq_f16(y, x))
 MAKE_FUNCTION(binary_op_rpow_fp16s, (__fp16)powf(y, x), vcvt_f16_f32(pow_ps(vcvt_f32_f16(y), vcvt_f32_f16(x))), vcombine_f16(vcvt_f16_f32(pow_ps(vcvt_f32_f16(vget_low_f16(y)), vcvt_f32_f16(vget_low_f16(x)))), vcvt_f16_f32(pow_ps(vcvt_f32_f16(vget_high_f16(y)), vcvt_f32_f16(vget_high_f16(x))))))
 MAKE_FUNCTION(binary_op_atan2_fp16s, (__fp16)atan2f(x, y), vcvt_f16_f32(atan2_ps(vcvt_f32_f16(x), vcvt_f32_f16(y))), vcombine_f16(vcvt_f16_f32(atan2_ps(vcvt_f32_f16(vget_low_f16(x)), vcvt_f32_f16(vget_low_f16(y)))), vcvt_f16_f32(atan2_ps(vcvt_f32_f16(vget_high_f16(x)), vcvt_f32_f16(vget_high_f16(y))))))
 MAKE_FUNCTION(binary_op_ratan2_fp16s, (__fp16)atan2f(y, x), vcvt_f16_f32(atan2_ps(vcvt_f32_f16(y), vcvt_f32_f16(x))), vcombine_f16(vcvt_f16_f32(atan2_ps(vcvt_f32_f16(vget_low_f16(y)), vcvt_f32_f16(vget_low_f16(x)))), vcvt_f16_f32(atan2_ps(vcvt_f32_f16(vget_high_f16(y)), vcvt_f32_f16(vget_high_f16(x))))))
+MAKE_FUNCTION(binary_op_remainder_fp16s, (__fp16)remainderf(x, y), vcvt_f16_f32(remainder_ps(vcvt_f32_f16(x), vcvt_f32_f16(y))), vcombine_f16(vcvt_f16_f32(remainder_ps(vcvt_f32_f16(vget_low_f16(x)), vcvt_f32_f16(vget_low_f16(y)))), vcvt_f16_f32(remainder_ps(vcvt_f32_f16(vget_high_f16(x)), vcvt_f32_f16(vget_high_f16(y))))))
 // *INDENT-ON*
 // clang-format on
 
@@ -353,6 +354,7 @@ static void binary_op_vector_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16
     if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector_fp16s<binary_op_rpow_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
     if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector_fp16s<binary_op_atan2_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
     if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector_fp16s<binary_op_ratan2_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
+    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector_fp16s<binary_op_remainder_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
 
     // should never reach here
 }
diff --git a/src/layer/arm/neon_mathfun.h b/src/layer/arm/neon_mathfun.h
index 78db901d904..e905a516f23 100644
--- a/src/layer/arm/neon_mathfun.h
+++ b/src/layer/arm/neon_mathfun.h
@@ -424,5 +424,16 @@ static inline float32x4_t atan2_ps(float32x4_t a, float32x4_t b)
     return vld1q_f32(tmpx);
 }
 
+static inline float32x4_t remainder_ps(float32x4_t x, float32x4_t y)
+{
+    float tmpx[4];
+    float tmpy[4];
+    vst1q_f32(tmpx, x);
+    vst1q_f32(tmpy, y);
+    for (int i = 0; i < 4; i++)
+        tmpx[i] = remainderf(tmpx[i], tmpy[i]);
+    return vld1q_f32(tmpx);
+}
+
 #include "neon_mathfun_tanh.h"
 #endif // NEON_MATHFUN_H

From 0f3c34fc33255348704d08850a69672e26346959 Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Tue, 12 Sep 2023 21:10:43 +0800
Subject: [PATCH 11/23] Change tests/test_binaryop OP_TYPE_MAX from 12 to 13

---
 tests/test_binaryop.cpp   | 2 +-
 tests/test_binaryop_1.cpp | 2 +-
 tests/test_binaryop_2.cpp | 2 +-
 tests/test_binaryop_3.cpp | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_binaryop.cpp b/tests/test_binaryop.cpp
index beb746388f2..9e257858545 100644
--- a/tests/test_binaryop.cpp
+++ b/tests/test_binaryop.cpp
@@ -15,7 +15,7 @@
 #include "layer/binaryop.h"
 #include "testutil.h"
 
-#define OP_TYPE_MAX 12
+#define OP_TYPE_MAX 13
 
 static int op_type = 0;
 
diff --git a/tests/test_binaryop_1.cpp b/tests/test_binaryop_1.cpp
index 4ce81714111..ad2c434075d 100644
--- a/tests/test_binaryop_1.cpp
+++ b/tests/test_binaryop_1.cpp
@@ -15,7 +15,7 @@
 #include "layer/binaryop.h"
 #include "testutil.h"
 
-#define OP_TYPE_MAX 12
+#define OP_TYPE_MAX 13
 
 static int op_type = 0;
 
diff --git a/tests/test_binaryop_2.cpp b/tests/test_binaryop_2.cpp
index f0730b1436e..b53f3e3259e 100644
--- a/tests/test_binaryop_2.cpp
+++ b/tests/test_binaryop_2.cpp
@@ -15,7 +15,7 @@
 #include "layer/binaryop.h"
 #include "testutil.h"
 
-#define OP_TYPE_MAX 12
+#define OP_TYPE_MAX 13
 
 static int op_type = 0;
 
diff --git a/tests/test_binaryop_3.cpp b/tests/test_binaryop_3.cpp
index d5e58ccc33e..42e6ed53ddc 100644
--- a/tests/test_binaryop_3.cpp
+++ b/tests/test_binaryop_3.cpp
@@ -15,7 +15,7 @@
 #include "layer/binaryop.h"
 #include "testutil.h"
 
-#define OP_TYPE_MAX 12
+#define OP_TYPE_MAX 13
 
 static int op_type = 0;
 

From 66887d94427fc6b1e19d04e184f9d45ff1d9baa4 Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Wed, 13 Sep 2023 19:43:33 +0800
Subject: [PATCH 12/23] Fix build error on riscv

---
 src/layer/riscv/binaryop_riscv.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/layer/riscv/binaryop_riscv.cpp b/src/layer/riscv/binaryop_riscv.cpp
index 31a6af6654d..efb87f0b245 100644
--- a/src/layer/riscv/binaryop_riscv.cpp
+++ b/src/layer/riscv/binaryop_riscv.cpp
@@ -825,6 +825,7 @@ MAKE_FUNCTION(binary_op_rdiv_fp16s, y / x, vfdiv_vv_f16m8(y, x, vl), vfrdiv_vf_f
 MAKE_FUNCTION(binary_op_rpow_fp16s, (__fp16)pow((float)y, (float)x), pow_ps(y, x, vl), pow_ps(vfmv_v_f_f16m8(y, vl), x, vl), pow_ps(y, vfmv_v_f_f16m8(x, vl), vl))
 MAKE_FUNCTION(binary_op_atan2_fp16s, (__fp16)atan2((float)x, (float)y), atan2_ps(x, y, vl), atan2_ps(x, vfmv_v_f_f16m8(y, vl), vl), atan2_ps(vfmv_v_f_f16m8(x, vl), y, vl))
 MAKE_FUNCTION(binary_op_ratan2_fp16s, (__fp16)atan2((float)y, (float)x), atan2_ps(y, x, vl), atan2_ps(vfmv_v_f_f16m8(y, vl), x, vl), atan2_ps(y, vfmv_v_f_f16m8(x, vl), vl))
+MAKE_FUNCTION(binary_op_remainder_fp16s, (__fp16)remainderf((float)x, (float)y), remainder_ps(x, y, vl), remainder_ps(x, vfmv_v_f_f32m8(y, vl), vl), remainder_ps(vfmv_v_f_f32m8(x, vl), y, vl))
 // *INDENT-ON*
 // clang-format on
 

From 5df5c8dd8c537897a91d6b04a1384a2a9aa7ae37 Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Wed, 13 Sep 2023 19:44:06 +0800
Subject: [PATCH 13/23] Add pnnx convertor

---
 tools/pnnx/src/pass_ncnn/expand_expression.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/pnnx/src/pass_ncnn/expand_expression.cpp b/tools/pnnx/src/pass_ncnn/expand_expression.cpp
index f6022be665f..7055f2af4e0 100644
--- a/tools/pnnx/src/pass_ncnn/expand_expression.cpp
+++ b/tools/pnnx/src/pass_ncnn/expand_expression.cpp
@@ -199,7 +199,7 @@ static std::string expand_expression(Graph& graph, const Operator* op, int& pnnx
             if (t == "div") op_binary->params["0"] = 3;
             if (t == "floor_divide") fprintf(stderr, "BinaryOp floor_divide not supported yet\n"); // TODO
             if (t == "fmod") fprintf(stderr, "BinaryOp fmod not supported yet\n");                 // TODO
-            if (t == "remainder") fprintf(stderr, "BinaryOp remainder not supported yet\n");       // TODO
+            if (t == "remainder") op_binary->params["0"] = 12;
             if (t == "pow") op_binary->params["0"] = 6;
             if (t == "atan2") op_binary->params["0"] = 10;
 

From a3e022fb2ea5461bb1efa7050c27a7cf52d190c5 Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Wed, 13 Sep 2023 19:44:47 +0800
Subject: [PATCH 14/23] Add remainder python unittest

---
 tools/pnnx/tests/ncnn/CMakeLists.txt          |  1 +
 tools/pnnx/tests/ncnn/test_torch_remainder.py | 61 +++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 tools/pnnx/tests/ncnn/test_torch_remainder.py

diff --git a/tools/pnnx/tests/ncnn/CMakeLists.txt b/tools/pnnx/tests/ncnn/CMakeLists.txt
index 945576bfaf6..0dee6c2f52c 100644
--- a/tools/pnnx/tests/ncnn/CMakeLists.txt
+++ b/tools/pnnx/tests/ncnn/CMakeLists.txt
@@ -175,6 +175,7 @@ pnnx_ncnn_add_test(torch_log10)
 pnnx_ncnn_add_test(torch_neg)
 pnnx_ncnn_add_test(torch_pow)
 pnnx_ncnn_add_test(torch_reciprocal)
+pnnx_ncnn_add_test(torch_remainder)
 pnnx_ncnn_add_test(torch_round)
 pnnx_ncnn_add_test(torch_rsqrt)
 pnnx_ncnn_add_test(torch_sin)
diff --git a/tools/pnnx/tests/ncnn/test_torch_remainder.py b/tools/pnnx/tests/ncnn/test_torch_remainder.py
new file mode 100644
index 00000000000..ffa06dba25f
--- /dev/null
+++ b/tools/pnnx/tests/ncnn/test_torch_remainder.py
@@ -0,0 +1,61 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z):
+        out0 = torch.remainder(x, y)
+        out1 = torch.remainder(y, y)
+        out2 = torch.remainder(z, torch.ones_like(z) + 0.5)
+        return out0, out1, out2
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(3, 16)
+    y = torch.rand(3, 16)
+    z = torch.rand(5, 9, 3)
+
+    a = net(x, y, z)
+
+    # export torchscript
+    mod = torch.jit.trace(net, (x, y, z))
+    mod.save("test_torch_remainder.pt")
+
+    # torchscript to pnnx
+    import os
+    os.system("../../src/pnnx test_torch_remainder.pt inputshape=[3,16],[3,16],[5,9,3]")
+
+    # ncnn inference
+    import test_torch_remainder_ncnn
+    b = test_torch_remainder_ncnn.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)

From 21e6ca0834afa4120f91fe903b9615b6695a14ea Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Thu, 21 Sep 2023 13:57:02 +0000
Subject: [PATCH 15/23] Try fix result error on x86

---
 src/layer/binaryop.cpp         | 6 +++++-
 src/layer/x86/avx512_mathfun.h | 2 +-
 src/layer/x86/avx_mathfun.h    | 2 +-
 src/layer/x86/binaryop_x86.cpp | 5 ++++-
 src/layer/x86/sse_mathfun.h    | 2 +-
 5 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/layer/binaryop.cpp b/src/layer/binaryop.cpp
index 80d0588c759..098687e589f 100644
--- a/src/layer/binaryop.cpp
+++ b/src/layer/binaryop.cpp
@@ -243,7 +243,11 @@ struct binary_op_remainder
 {
     float operator()(const float& x, const float& y) const
     {
-        return (float)remainderf(x, y);
+        float div_result = x / y;
+        float round_result = roundf(div_result);
+        float res = x - y * round_result;
+        return res;
+        // return (float)remainderf(x, y);
     }
 };
 
diff --git a/src/layer/x86/avx512_mathfun.h b/src/layer/x86/avx512_mathfun.h
index 725643d9e2d..cc96a02273e 100644
--- a/src/layer/x86/avx512_mathfun.h
+++ b/src/layer/x86/avx512_mathfun.h
@@ -859,7 +859,7 @@ static NCNN_FORCEINLINE __m512 abs512_ps(__m512 x)
 static NCNN_FORCEINLINE __m512 remainder512_ps(__m512 x, __m512 y)
 {
     const __m512 round_div_result = _mm512_div_round_ps(x, y, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
-    const __m512 mul_result = _mm512_mul_ps(round_div_result, y);
+    const __m512 mul_result = _mm512_mul_ps(y, round_div_result);
     return _mm512_sub_ps(x, mul_result);
 }
 
diff --git a/src/layer/x86/avx_mathfun.h b/src/layer/x86/avx_mathfun.h
index 24788cdf7a1..a4debeeb3ce 100644
--- a/src/layer/x86/avx_mathfun.h
+++ b/src/layer/x86/avx_mathfun.h
@@ -1091,7 +1091,7 @@ static NCNN_FORCEINLINE __m256 remainder256_ps(__m256 x, __m256 y)
 {
     const __m256 div_result = _mm256_div_ps(x, y);
     const __m256 round_result = _mm256_round_ps(div_result, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
-    const __m256 mul_result = _mm256_mul_ps(round_result, y);
+    const __m256 mul_result = _mm256_mul_ps(y, round_result);
     return _mm256_sub_ps(x, mul_result);
 }
 
diff --git a/src/layer/x86/binaryop_x86.cpp b/src/layer/x86/binaryop_x86.cpp
index 5123767b23a..d438e6920b1 100644
--- a/src/layer/x86/binaryop_x86.cpp
+++ b/src/layer/x86/binaryop_x86.cpp
@@ -795,7 +795,10 @@ struct binary_op_remainder
 {
     float func(const float& x, const float& y) const
     {
-        return (float)remainderf(x, y);
+        float div_result = x / y;
+        float round_result = roundf(div_result);
+        float res = x - y * round_result;
+        return res;
     }
 #if __SSE2__
     __m128 func_pack4(const __m128& x, const __m128& y) const
diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h
index cc34d7f7989..2c7ef13386d 100644
--- a/src/layer/x86/sse_mathfun.h
+++ b/src/layer/x86/sse_mathfun.h
@@ -1166,7 +1166,7 @@ static NCNN_FORCEINLINE __m128 remainder_ps(__m128 x, __m128 y)
 {
     const __m128 div_result = _mm_div_ps(x, y);
     const __m128 round_result = _mm_round_ps(div_result, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
-    const __m128 mul_result = _mm_mul_ps(round_result, y);
+    const __m128 mul_result = _mm_mul_ps(y, round_result);
     return _mm_sub_ps(x, mul_result);
 }
 

From cea002681c744c5f8e43838885aa66352962343a Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Thu, 21 Sep 2023 14:10:26 +0000
Subject: [PATCH 16/23] Fix args in binaryop.cpp

---
 src/layer/binaryop.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/layer/binaryop.cpp b/src/layer/binaryop.cpp
index 098687e589f..b220beb2da5 100644
--- a/src/layer/binaryop.cpp
+++ b/src/layer/binaryop.cpp
@@ -265,7 +265,7 @@ static void binary_op_broadcast(const Mat& a, const Mat& b, Mat& c, int op_type,
     if (op_type == BinaryOp::Operation_RPOW) return binary_op_broadcast<binary_op_pow>(b, a, c, opt);
     if (op_type == BinaryOp::Operation_ATAN2) return binary_op_broadcast<binary_op_atan2>(a, b, c, opt);
     if (op_type == BinaryOp::Operation_RATAN2) return binary_op_broadcast<binary_op_atan2>(b, a, c, opt);
-    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_broadcast<binary_op_remainder>(b, a, c, opt);
+    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_broadcast<binary_op_remainder>(a, b, c, opt);
 
     // should never reach here
 }

From 22ce01f2138136c95e48695e7a0318e66459f4a9 Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Tue, 17 Oct 2023 09:14:51 +0000
Subject: [PATCH 17/23] Fix compute error for remainder on x86

---
 src/CMakeLists.txt             |  2 +-
 src/layer/binaryop.cpp         |  9 ++++-----
 src/layer/x86/avx512_mathfun.h |  5 +++--
 src/layer/x86/avx_mathfun.h    |  4 ++--
 src/layer/x86/binaryop_x86.cpp |  8 ++++----
 src/layer/x86/sse_mathfun.h    | 14 +++++++-------
 6 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 07496791a96..cb7be1f6e9c 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -345,7 +345,7 @@ if(NCNN_TARGET_ARCH STREQUAL "x86")
         if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
             target_compile_options(ncnn PRIVATE /arch:SSE2 /D__SSE2__)
         else()
-            target_compile_options(ncnn PRIVATE -msse4.1 -msse2 -msse)
+            target_compile_options(ncnn PRIVATE -msse2 -msse)
             if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
                 target_compile_options(ncnn PRIVATE -msimd128)
             endif()
diff --git a/src/layer/binaryop.cpp b/src/layer/binaryop.cpp
index b220beb2da5..6f9afc55c1a 100644
--- a/src/layer/binaryop.cpp
+++ b/src/layer/binaryop.cpp
@@ -243,11 +243,10 @@ struct binary_op_remainder
 {
     float operator()(const float& x, const float& y) const
     {
-        float div_result = x / y;
-        float round_result = roundf(div_result);
-        float res = x - y * round_result;
-        return res;
-        // return (float)remainderf(x, y);
+        const float div_result = x / y;
+        const float floor_result = floorf(div_result);
+        const float mul_result = floor_result * y;
+        return x - mul_result;
     }
 };
 
diff --git a/src/layer/x86/avx512_mathfun.h b/src/layer/x86/avx512_mathfun.h
index cc96a02273e..4bd8d074202 100644
--- a/src/layer/x86/avx512_mathfun.h
+++ b/src/layer/x86/avx512_mathfun.h
@@ -858,8 +858,9 @@ static NCNN_FORCEINLINE __m512 abs512_ps(__m512 x)
 
 static NCNN_FORCEINLINE __m512 remainder512_ps(__m512 x, __m512 y)
 {
-    const __m512 round_div_result = _mm512_div_round_ps(x, y, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
-    const __m512 mul_result = _mm512_mul_ps(y, round_div_result);
+    const __m512 div_result = _mm512_div_ps(x, y);
+    const __m512 floor_result = _mm512_floor_ps(div_result);
+    const __m512 mul_result = _mm512_mul_ps(y, floor_result);
     return _mm512_sub_ps(x, mul_result);
 }
 
diff --git a/src/layer/x86/avx_mathfun.h b/src/layer/x86/avx_mathfun.h
index a4debeeb3ce..d3708f7fb1b 100644
--- a/src/layer/x86/avx_mathfun.h
+++ b/src/layer/x86/avx_mathfun.h
@@ -1090,8 +1090,8 @@ static NCNN_FORCEINLINE __m256 abs256_ps(__m256 x)
 static NCNN_FORCEINLINE __m256 remainder256_ps(__m256 x, __m256 y)
 {
     const __m256 div_result = _mm256_div_ps(x, y);
-    const __m256 round_result = _mm256_round_ps(div_result, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
-    const __m256 mul_result = _mm256_mul_ps(y, round_result);
+    const __m256 floor_result = _mm256_floor_ps(div_result);
+    const __m256 mul_result = _mm256_mul_ps(y, floor_result);
     return _mm256_sub_ps(x, mul_result);
 }
 
diff --git a/src/layer/x86/binaryop_x86.cpp b/src/layer/x86/binaryop_x86.cpp
index d438e6920b1..c258f71a2fa 100644
--- a/src/layer/x86/binaryop_x86.cpp
+++ b/src/layer/x86/binaryop_x86.cpp
@@ -795,10 +795,10 @@ struct binary_op_remainder
 {
     float func(const float& x, const float& y) const
     {
-        float div_result = x / y;
-        float round_result = roundf(div_result);
-        float res = x - y * round_result;
-        return res;
+        const float div_result = x / y;
+        const float floor_result = floorf(div_result);
+        const float mul_result = floor_result * y;
+        return x - mul_result;
     }
 #if __SSE2__
     __m128 func_pack4(const __m128& x, const __m128& y) const
diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h
index 2c7ef13386d..5ca090ef206 100644
--- a/src/layer/x86/sse_mathfun.h
+++ b/src/layer/x86/sse_mathfun.h
@@ -33,7 +33,6 @@
 #define SSE_MATHFUN_H
 
 #define USE_SSE2 1
-#define USE_SSE4 1
 
 #include <xmmintrin.h>
 #include <x86_usability.h>
@@ -58,10 +57,6 @@ typedef __m128i v4si; // vector of 4 int (sse2)
 typedef __m64 v2si; // vector of 2 int (mmx)
 #endif
 
-#ifdef USE_SSE4
-#include <smmintrin.h>
-#endif
-
 /* declare some SSE constants -- why can't I figure a better way to do that? */
 #define _PS_CONST(Name, Val) \
     static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = {Val, Val, Val, Val}
@@ -1165,8 +1160,13 @@ static NCNN_FORCEINLINE __m128 abs_ps(__m128 inputs)
 static NCNN_FORCEINLINE __m128 remainder_ps(__m128 x, __m128 y)
 {
     const __m128 div_result = _mm_div_ps(x, y);
-    const __m128 round_result = _mm_round_ps(div_result, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
-    const __m128 mul_result = _mm_mul_ps(y, round_result);
+    // Need SSE4.1
+    // const __m128 floor_result = _mm_floor_ps(div_result);
+    const __m128 trunc_result = _mm_cvtepi32_ps(_mm_cvttps_epi32(div_result));
+    const __m128 cmp = _mm_cmplt_ps(div_result, trunc_result);
+    const __m128 one = _mm_set1_ps(1.0f);
+    const __m128 floor_result = _mm_sub_ps(trunc_result, _mm_and_ps(cmp, one));
+    const __m128 mul_result = _mm_mul_ps(y, floor_result);
     return _mm_sub_ps(x, mul_result);
 }
 

From 6097a1761b43a4a85bba19ea8ad4a3674fa332b8 Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Tue, 17 Oct 2023 09:21:54 +0000
Subject: [PATCH 18/23] Fix compute error for remainder on vulkan

---
 src/layer/vulkan/shader/binaryop.comp                    | 2 +-
 src/layer/vulkan/shader/binaryop_broadcast.comp          | 2 +-
 src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp | 2 +-
 src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp | 4 ++--
 src/layer/vulkan/shader/binaryop_broadcast_pack4.comp    | 2 +-
 src/layer/vulkan/shader/binaryop_broadcast_pack8.comp    | 4 ++--
 src/layer/vulkan/shader/binaryop_pack4.comp              | 2 +-
 src/layer/vulkan/shader/binaryop_pack8.comp              | 4 ++--
 8 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/layer/vulkan/shader/binaryop.comp b/src/layer/vulkan/shader/binaryop.comp
index 02c986817a3..a6632e0ee1d 100644
--- a/src/layer/vulkan/shader/binaryop.comp
+++ b/src/layer/vulkan/shader/binaryop.comp
@@ -137,7 +137,7 @@ void main()
     if (op_type == 10) res = atan(v1, v2);
     if (op_type == 11) res = atan(v2, v1);
 #endif
-    if (op_type == 12) res = mod(v1, v2);
+    if (op_type == 12) res = v1 - floorf(v1 / v2) * v2;
 
 #if NCNN_image_shader
     image3d_st1(top_blob_3d, ivec3(gx, gy, gz), res);
diff --git a/src/layer/vulkan/shader/binaryop_broadcast.comp b/src/layer/vulkan/shader/binaryop_broadcast.comp
index d15d8e855ca..93edcb886ab 100644
--- a/src/layer/vulkan/shader/binaryop_broadcast.comp
+++ b/src/layer/vulkan/shader/binaryop_broadcast.comp
@@ -199,7 +199,7 @@ void main()
     if (op_type == 10) res = atan(v1, v2);
     if (op_type == 11) res = atan(v2, v1);
 #endif
-    if (op_type == 12) res = mod(v1, v2);
+    if (op_type == 12) res = v1 - floorf(v1 / v2) * v2;
 
 #if NCNN_image_shader
     image3d_st1(top_blob_3d, ivec3(gx, gy, gz), res);
diff --git a/src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp b/src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp
index 6279153b800..64cd537b71f 100644
--- a/src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp
+++ b/src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp
@@ -130,7 +130,7 @@ void main()
     if (op_type == 10) res = atan(v1, v2);
     if (op_type == 11) res = atan(v2, v1);
 #endif
-    if (op_type == 12) res = mod(v1, v2);
+    if (op_type == 12) res = v1 - floorf(v1 / v2) * v2;
 
 #if NCNN_image_shader
     image3d_st4(top_blob_3d, ivec3(gx, gy, gz), res);
diff --git a/src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp b/src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp
index 9aa70946404..f497e0726b2 100644
--- a/src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp
+++ b/src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp
@@ -189,8 +189,8 @@ void main()
     }
     if (op_type == 12)
     {
-        res[0] = mod(v1[0], v2[0]);
-        res[1] = mod(v1[1], v2[1]);
+        res[0] = v1[0] - floorf(v1[0] / v2[0]) * v2[0];
+        res[1] = v1[1] - floorf(v1[1] / v2[1]) * v2[1];
     }
 
 #if NCNN_image_shader
diff --git a/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp b/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp
index bcba406a748..4a9116dfc4f 100644
--- a/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp
+++ b/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp
@@ -199,7 +199,7 @@ void main()
     if (op_type == 10) res = atan(v1, v2);
     if (op_type == 11) res = atan(v2, v1);
 #endif
-    if (op_type == 12) res = mod(v1, v2);
+    if (op_type == 12) res = v1 - floorf(v1 / v2) * v2;
 
 #if NCNN_image_shader
     image3d_st4(top_blob_3d, ivec3(gx, gy, gz), res);
diff --git a/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp b/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp
index f5cbd166b34..1b6ca17bcbb 100644
--- a/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp
+++ b/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp
@@ -255,8 +255,8 @@ void main()
     }
     if (op_type == 12)
     {
-        res[0] = mod(v1[0], v2[0]);
-        res[1] = mod(v1[1], v2[1]);
+        res[0] = v1[0] - floorf(v1[0] / v2[0]) * v2[0];
+        res[1] = v1[1] - floorf(v1[1] / v2[1]) * v2[1];
     }
 
 #if NCNN_image_shader
diff --git a/src/layer/vulkan/shader/binaryop_pack4.comp b/src/layer/vulkan/shader/binaryop_pack4.comp
index a1df2479aeb..f86d53ac8b7 100644
--- a/src/layer/vulkan/shader/binaryop_pack4.comp
+++ b/src/layer/vulkan/shader/binaryop_pack4.comp
@@ -128,7 +128,7 @@ void main()
     if (op_type == 10) res = atan(v1, v2);
     if (op_type == 11) res = atan(v2, v1);
 #endif
-    if (op_type == 12) res = mod(v1, v2);
+    if (op_type == 12) res = v1 - floorf(v1 / v2) * v2;
 
 #if NCNN_image_shader
     image3d_st4(top_blob_3d, ivec3(gx, gy, gz), res);
diff --git a/src/layer/vulkan/shader/binaryop_pack8.comp b/src/layer/vulkan/shader/binaryop_pack8.comp
index 924a01f2534..1be2bd5fc18 100644
--- a/src/layer/vulkan/shader/binaryop_pack8.comp
+++ b/src/layer/vulkan/shader/binaryop_pack8.comp
@@ -185,8 +185,8 @@ void main()
     }
     if (op_type == 12)
     {
-        res[0] = mod(v1[0], v2[0]);
-        res[1] = mod(v1[1], v2[1]);
+        res[0] = v1[0] - floorf(v1[0] / v2[0]) * v2[0];
+        res[1] = v1[1] - floorf(v1[1] / v2[1]) * v2[1];
     }
 
 #if NCNN_image_shader

From 77f908deaacd64c31b0a60af300f7f06306e50a3 Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Tue, 17 Oct 2023 09:25:52 +0000
Subject: [PATCH 19/23] Fix compute error for remainder on loongarch

---
 src/layer/loongarch/binaryop_loongarch.cpp | 2 +-
 src/layer/loongarch/lsx_mathfun.h          | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/layer/loongarch/binaryop_loongarch.cpp b/src/layer/loongarch/binaryop_loongarch.cpp
index 3d4f191dcda..d7d3bb6af7b 100644
--- a/src/layer/loongarch/binaryop_loongarch.cpp
+++ b/src/layer/loongarch/binaryop_loongarch.cpp
@@ -314,7 +314,7 @@ MAKE_FUNCTION(binary_op_rdiv, y / x, __lsx_vfdiv_s(y, x))
 MAKE_FUNCTION(binary_op_rpow, (float)pow(y, x), pow_ps(y, x))
 MAKE_FUNCTION(binary_op_atan2, (float)atan2(x, y), atan2_ps(x, y))
 MAKE_FUNCTION(binary_op_ratan2, (float)atan2(y, x), atan2_ps(y, x))
-MAKE_FUNCTION(binary_op_remainder, remainder(x, y), remainder_ps(x, y))
+MAKE_FUNCTION(binary_op_remainder, remainderf(x, y), remainder_ps(x, y))
 // *INDENT-ON*
 // clang-format on
 
diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h
index 9a67473fcef..8c15d94adb9 100644
--- a/src/layer/loongarch/lsx_mathfun.h
+++ b/src/layer/loongarch/lsx_mathfun.h
@@ -275,10 +275,10 @@ static inline __m128 remainder_ps(__m128 x, __m128 y)
     float tmpy[4];
     __lsx_vst(x, tmpx, 0);
     __lsx_vst(y, tmpy, 0);
-    tmpx[0] = remainder(tmpx[0], tmpy[0]);
-    tmpx[1] = remainder(tmpx[1], tmpy[1]);
-    tmpx[2] = remainder(tmpx[2], tmpy[2]);
-    tmpx[3] = remainder(tmpx[3], tmpy[3]);
+    tmpx[0] = remainderf(tmpx[0], tmpy[0]);
+    tmpx[1] = remainderf(tmpx[1], tmpy[1]);
+    tmpx[2] = remainderf(tmpx[2], tmpy[2]);
+    tmpx[3] = remainderf(tmpx[3], tmpy[3]);
     return __lsx_vld(tmpx, 0);
 }
 

From 8f7770da4f50147b94a83d580096edc40ef2145e Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Tue, 17 Oct 2023 09:26:14 +0000
Subject: [PATCH 20/23] Fix compute error for remainder on mips

---
 src/layer/mips/binaryop_mips.cpp | 2 +-
 src/layer/mips/msa_mathfun.h     | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/layer/mips/binaryop_mips.cpp b/src/layer/mips/binaryop_mips.cpp
index d10641a3785..e407bc962c2 100644
--- a/src/layer/mips/binaryop_mips.cpp
+++ b/src/layer/mips/binaryop_mips.cpp
@@ -314,7 +314,7 @@ MAKE_FUNCTION(binary_op_rdiv, y / x, __msa_fdiv_w(y, x))
 MAKE_FUNCTION(binary_op_rpow, (float)pow(y, x), pow_ps(y, x))
 MAKE_FUNCTION(binary_op_atan2, (float)atan2(x, y), atan2_ps(x, y))
 MAKE_FUNCTION(binary_op_ratan2, (float)atan2(y, x), atan2_ps(y, x))
-MAKE_FUNCTION(binary_op_remainder, remainder(x, y), remainder_ps(x, y))
+MAKE_FUNCTION(binary_op_remainder, remainderf(x, y), remainder_ps(x, y))
 // *INDENT-ON*
 // clang-format on
 
diff --git a/src/layer/mips/msa_mathfun.h b/src/layer/mips/msa_mathfun.h
index a005b95564b..9350fd15c7e 100644
--- a/src/layer/mips/msa_mathfun.h
+++ b/src/layer/mips/msa_mathfun.h
@@ -273,10 +273,10 @@ static inline v4f32 remainder_ps(v4f32 x, v4f32 y)
     float tmpy[4];
     __msa_st_w(x, tmpx, 0);
     __msa_st_w(y, tmpy, 0);
-    tmpx[0] = remainder(tmpx[0], tmpy[0]);
-    tmpx[1] = remainder(tmpx[1], tmpy[1]);
-    tmpx[2] = remainder(tmpx[2], tmpy[2]);
-    tmpx[3] = remainder(tmpx[3], tmpy[3]);
+    tmpx[0] = remainderf(tmpx[0], tmpy[0]);
+    tmpx[1] = remainderf(tmpx[1], tmpy[1]);
+    tmpx[2] = remainderf(tmpx[2], tmpy[2]);
+    tmpx[3] = remainderf(tmpx[3], tmpy[3]);
     return __msa_ld_w(tmpx, 0);
 }
 

From a370f4184560035228d5c03a3e03d2885207931f Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Tue, 17 Oct 2023 09:26:43 +0000
Subject: [PATCH 21/23] Preprocess divisor for remainder on unittest

---
 tests/test_binaryop_3.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/test_binaryop_3.cpp b/tests/test_binaryop_3.cpp
index 42e6ed53ddc..d4bf5faceba 100644
--- a/tests/test_binaryop_3.cpp
+++ b/tests/test_binaryop_3.cpp
@@ -55,6 +55,14 @@ static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b, int flag)
                 b[i] = 0.001f;
         }
     }
+    if (op_type == 12) {
+        // divisor must be non-zero for remainder
+        b = b.clone();
+        for (int i = 0; i < b.total(); i++) {
+            if (b[i] == 0.f)
+                b[i] = 0.001f;
+        }
+    }
 
     ncnn::ParamDict pd;
     pd.set(0, op_type);

From b4b198e9b4d9b118d6d5a731faf5585768881b33 Mon Sep 17 00:00:00 2001
From: Fisher <fisheryung@outlook.com>
Date: Wed, 18 Oct 2023 03:06:13 +0000
Subject: [PATCH 22/23] Fix build error

---
 src/layer/loongarch/lsx_mathfun.h  | 2 +-
 src/layer/mips/msa_mathfun.h       | 6 +++---
 src/layer/riscv/binaryop_riscv.cpp | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h
index 8c15d94adb9..a09cd6d711a 100644
--- a/src/layer/loongarch/lsx_mathfun.h
+++ b/src/layer/loongarch/lsx_mathfun.h
@@ -279,7 +279,7 @@ static inline __m128 remainder_ps(__m128 x, __m128 y)
     tmpx[1] = remainderf(tmpx[1], tmpy[1]);
     tmpx[2] = remainderf(tmpx[2], tmpy[2]);
     tmpx[3] = remainderf(tmpx[3], tmpy[3]);
-    return __lsx_vld(tmpx, 0);
+    return (__m128)__lsx_vld(tmpx, 0);
 }
 
 #endif // LSX_MATHFUN_H
diff --git a/src/layer/mips/msa_mathfun.h b/src/layer/mips/msa_mathfun.h
index 9350fd15c7e..07b30ea7f85 100644
--- a/src/layer/mips/msa_mathfun.h
+++ b/src/layer/mips/msa_mathfun.h
@@ -271,13 +271,13 @@ static inline v4f32 remainder_ps(v4f32 x, v4f32 y)
 {
     float tmpx[4];
     float tmpy[4];
-    __msa_st_w(x, tmpx, 0);
-    __msa_st_w(y, tmpy, 0);
+    __msa_st_w((v4i32)x, tmpx, 0);
+    __msa_st_w((v4i32)y, tmpy, 0);
     tmpx[0] = remainderf(tmpx[0], tmpy[0]);
     tmpx[1] = remainderf(tmpx[1], tmpy[1]);
     tmpx[2] = remainderf(tmpx[2], tmpy[2]);
     tmpx[3] = remainderf(tmpx[3], tmpy[3]);
-    return __msa_ld_w(tmpx, 0);
+    return (v4f32)__msa_ld_w(tmpx, 0);
 }
 
 #endif // MSA_MATHFUN_H
diff --git a/src/layer/riscv/binaryop_riscv.cpp b/src/layer/riscv/binaryop_riscv.cpp
index efb87f0b245..e9b3e17e16b 100644
--- a/src/layer/riscv/binaryop_riscv.cpp
+++ b/src/layer/riscv/binaryop_riscv.cpp
@@ -825,7 +825,7 @@ MAKE_FUNCTION(binary_op_rdiv_fp16s, y / x, vfdiv_vv_f16m8(y, x, vl), vfrdiv_vf_f
 MAKE_FUNCTION(binary_op_rpow_fp16s, (__fp16)pow((float)y, (float)x), pow_ps(y, x, vl), pow_ps(vfmv_v_f_f16m8(y, vl), x, vl), pow_ps(y, vfmv_v_f_f16m8(x, vl), vl))
 MAKE_FUNCTION(binary_op_atan2_fp16s, (__fp16)atan2((float)x, (float)y), atan2_ps(x, y, vl), atan2_ps(x, vfmv_v_f_f16m8(y, vl), vl), atan2_ps(vfmv_v_f_f16m8(x, vl), y, vl))
 MAKE_FUNCTION(binary_op_ratan2_fp16s, (__fp16)atan2((float)y, (float)x), atan2_ps(y, x, vl), atan2_ps(vfmv_v_f_f16m8(y, vl), x, vl), atan2_ps(y, vfmv_v_f_f16m8(x, vl), vl))
-MAKE_FUNCTION(binary_op_remainder_fp16s, (__fp16)remainderf((float)x, (float)y), remainder_ps(x, y, vl), remainder_ps(x, vfmv_v_f_f32m8(y, vl), vl), remainder_ps(vfmv_v_f_f32m8(x, vl), y, vl))
+MAKE_FUNCTION(binary_op_remainder_fp16s, (__fp16)remainderf((float)x, (float)y), remainder_ps(x, y, vl), remainder_ps(x, vfmv_v_f_f16m8(y, vl), vl), remainder_ps(vfmv_v_f_f16m8(x, vl), y, vl))
 // *INDENT-ON*
 // clang-format on
 

From d917d97a11a23a4f356a7888859de7a1e03b3552 Mon Sep 17 00:00:00 2001
From: nihui <nihui@users.noreply.github.com>
Date: Fri, 22 Dec 2023 03:44:56 +0000
Subject: [PATCH 23/23] apply code-format changes

---
 tests/test_binaryop_3.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/test_binaryop_3.cpp b/tests/test_binaryop_3.cpp
index dd5b7abc8fe..1d557a2002c 100644
--- a/tests/test_binaryop_3.cpp
+++ b/tests/test_binaryop_3.cpp
@@ -55,10 +55,12 @@ static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b, int flag)
                 b[i] = 0.001f;
         }
     }
-    if (op_type == 12) {
+    if (op_type == 12)
+    {
         // divisor must be non-zero for remainder
         b = b.clone();
-        for (int i = 0; i < b.total(); i++) {
+        for (int i = 0; i < b.total(); i++)
+        {
             if (b[i] == 0.f)
                 b[i] = 0.001f;
         }