From 242dd3cad870622cdaf556da8088673a7a33e791 Mon Sep 17 00:00:00 2001 From: Fisher Date: Thu, 3 Aug 2023 22:59:38 +0800 Subject: [PATCH 01/23] Add draft code for op remainder on x86 --- src/layer/x86/remainder_x86.cpp | 159 ++++++++++++++++++++++++++++++++ src/layer/x86/remainder_x86.h | 18 ++++ 2 files changed, 177 insertions(+) create mode 100644 src/layer/x86/remainder_x86.cpp create mode 100644 src/layer/x86/remainder_x86.h diff --git a/src/layer/x86/remainder_x86.cpp b/src/layer/x86/remainder_x86.cpp new file mode 100644 index 00000000000..6cd5cb08be0 --- /dev/null +++ b/src/layer/x86/remainder_x86.cpp @@ -0,0 +1,159 @@ +#include "remainder_x86.h" + +#if __SSE2__ +#include +#if __AVX__ +#include +#endif // __AVX__ +#endif // __SSE2__ +#include "x86_usability.h" + +namespace ncnn { + +Remainder_x86::Remainder_x86() +{ +#if __SSE2__ + support_packing = true; +#endif // __SSE2__ +} + +int Remainder_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int elempack = bottom_blob.elempack; + int size = w * h * d * elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create_like(bottom_blob, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _p1 = _mm512_loadu_ps(ptr1); + // TODO: Instruction for remainder + // _p = xxxxx(_p, _p1); + // _mm512_storeu_ps(outptr, _p); + + ptr += 16; + ptr1 += 16; + outptr += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _p1 = _mm256_loadu_ps(ptr1); + // TODO: Instruction for remainder + // _p = xxxxx(_p, _p1); + // _mm256_storeu_ps(outptr, _p); + + ptr += 8; + ptr1 += 8; + outptr += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_load_ps(ptr); + __m128 _p1 = _mm_load_ps(ptr1); + // TODO: Instruction for remainder + // _p = xxxxx(_p, _p1); + // _mm_store_ps(outptr, _p); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + *outptr = remainderf(*ptr, *ptr1); + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob2 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob2.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(outptr); + __m512 _p1 = _mm512_loadu_ps(ptr); + // TODO: Instruction for remainder + // _p = xxxxx(_p, _p1); + // _mm512_storeu_ps(outptr, _p); + + ptr += 16; + outptr += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(outptr); + __m256 _p1 = _mm256_loadu_ps(ptr); + // TODO: Instruction for remainder + // _p = xxxxx(_p, _p1); + // _mm256_storeu_ps(outptr, _p); + + ptr += 8; + outptr += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_load_ps(outptr); + __m128 _p1 = _mm_load_ps(ptr); + // TODO: Instruction for remainder + // _p = xxxxx(_p, _p1); + // _mm_store_ps(outptr, _p); + + ptr += 4; + outptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + *outptr = remainderf(*outptr, *ptr); + + ptr++; + outptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/x86/remainder_x86.h b/src/layer/x86/remainder_x86.h new file mode 100644 index 00000000000..2a260eccf4f --- /dev/null +++ b/src/layer/x86/remainder_x86.h @@ -0,0 +1,18 @@ +#ifndef LAYER_ELTWISE_X86_H +#define LAYER_ELTWISE_X86_H + +#include "eltwise.h" + +namespace ncnn { + +class Remainder_x86 : virtual public Eltwise +{ +public: + Remainder_x86(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_ELTWISE_X86_H From 3d77066d86169926952cad11868f099cf5879979 Mon Sep 17 00:00:00 2001 From: Fisher Date: Fri, 4 Aug 2023 23:10:50 +0800 Subject: [PATCH 02/23] Remove old remainder_x86 --- src/layer/x86/remainder_x86.cpp | 159 -------------------------------- src/layer/x86/remainder_x86.h | 18 ---- 2 files changed, 177 deletions(-) delete mode 100644 src/layer/x86/remainder_x86.cpp delete mode 100644 src/layer/x86/remainder_x86.h diff --git a/src/layer/x86/remainder_x86.cpp b/src/layer/x86/remainder_x86.cpp deleted file mode 100644 index 6cd5cb08be0..00000000000 --- a/src/layer/x86/remainder_x86.cpp +++ /dev/null @@ -1,159 +0,0 @@ -#include "remainder_x86.h" - -#if __SSE2__ -#include -#if __AVX__ -#include -#endif // __AVX__ -#endif // __SSE2__ -#include "x86_usability.h" - -namespace ncnn { - -Remainder_x86::Remainder_x86() -{ -#if __SSE2__ - support_packing = true; -#endif // __SSE2__ -} - -int Remainder_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ - const Mat& bottom_blob = bottom_blobs[0]; - int w = bottom_blob.w; - int h = bottom_blob.h; - int d = bottom_blob.d; - int channels = bottom_blob.c; - int elempack = bottom_blob.elempack; - int size = w * h * d * elempack; - - Mat& top_blob = top_blobs[0]; - top_blob.create_like(bottom_blob, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - // first blob - const Mat& bottom_blob1 = bottom_blobs[1]; - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = bottom_blob.channel(q); - const float* ptr1 = bottom_blob1.channel(q); - float* outptr = top_blob.channel(q); - - int i = 0; -#if __SSE2__ -#if __AVX__ -#if __AVX512F__ - for (; i + 15 < size; i += 16) - { - __m512 _p = _mm512_loadu_ps(ptr); - __m512 _p1 = _mm512_loadu_ps(ptr1); - // TODO: Instruction for remainder - // _p = xxxxx(_p, _p1); - // _mm512_storeu_ps(outptr, _p); - - ptr += 16; - ptr1 += 16; - outptr += 16; - } -#endif // __AVX512F__ - for (; i + 7 < size; i += 8) - { - __m256 _p = _mm256_loadu_ps(ptr); - __m256 _p1 = _mm256_loadu_ps(ptr1); - // TODO: Instruction for remainder - // _p = xxxxx(_p, _p1); - // _mm256_storeu_ps(outptr, _p); - - ptr += 8; - ptr1 += 8; - outptr += 8; - } -#endif // __AVX__ - for (; i + 3 < size; i += 4) - { - __m128 _p = _mm_load_ps(ptr); - __m128 _p1 = _mm_load_ps(ptr1); - // TODO: Instruction for remainder - // _p = xxxxx(_p, _p1); - // _mm_store_ps(outptr, _p); - - ptr += 4; - ptr1 += 4; - outptr += 4; - } -#endif // __SSE2__ - for (; i < size; i++) - { - *outptr = remainderf(*ptr, *ptr1); - - ptr++; - ptr1++; - outptr++; - } - } - - for (size_t b = 2; b < bottom_blobs.size(); b++) - { - const Mat& bottom_blob2 = bottom_blobs[b]; - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const float* ptr = bottom_blob2.channel(q); - float* outptr = top_blob.channel(q); - - int i = 0; -#if __SSE2__ -#if __AVX__ -#if __AVX512F__ - for (; i + 15 < size; i += 16) - { - __m512 _p = _mm512_loadu_ps(outptr); - __m512 _p1 = _mm512_loadu_ps(ptr); - // TODO: Instruction for remainder - // _p = xxxxx(_p, _p1); - // _mm512_storeu_ps(outptr, _p); - - ptr += 16; - outptr += 16; - } -#endif // __AVX512F__ - for (; i + 7 < size; i += 8) - { - __m256 _p = _mm256_loadu_ps(outptr); - __m256 _p1 = _mm256_loadu_ps(ptr); - // TODO: Instruction for remainder - // _p = xxxxx(_p, _p1); - // _mm256_storeu_ps(outptr, _p); - - ptr += 8; - outptr += 8; - } -#endif // __AVX__ - for (; i + 3 < size; i += 4) - { - __m128 _p = _mm_load_ps(outptr); - __m128 _p1 = _mm_load_ps(ptr); - // TODO: Instruction for remainder - // _p = xxxxx(_p, _p1); - // _mm_store_ps(outptr, _p); - - ptr += 4; - outptr += 4; - } -#endif // __SSE2__ - for (; i < size; i++) - { - *outptr = remainderf(*outptr, *ptr); - - ptr++; - outptr++; - } - } - } - - return 0; -} - -} // namespace ncnn diff --git a/src/layer/x86/remainder_x86.h b/src/layer/x86/remainder_x86.h deleted file mode 100644 index 2a260eccf4f..00000000000 --- a/src/layer/x86/remainder_x86.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef LAYER_ELTWISE_X86_H -#define LAYER_ELTWISE_X86_H - -#include "eltwise.h" - -namespace ncnn { - -class Remainder_x86 : virtual public Eltwise -{ -public: - Remainder_x86(); - - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; -}; - -} // namespace ncnn - -#endif // LAYER_ELTWISE_X86_H From 173d199f2a385774242b225fefc6a0253d72a4b2 Mon Sep 17 00:00:00 2001 From: Fisher Date: Fri, 4 Aug 2023 23:11:10 +0800 Subject: [PATCH 03/23] Refactor remainder_x86 to binary op --- src/layer/binaryop.cpp | 10 ++++++++++ src/layer/binaryop.h | 3 ++- src/layer/x86/avx512_mathfun.h | 7 +++++++ src/layer/x86/avx_mathfun.h | 8 ++++++++ src/layer/x86/binaryop_x86.cpp | 27 +++++++++++++++++++++++++++ src/layer/x86/sse_mathfun.h | 10 ++++++++++ 6 files changed, 64 insertions(+), 1 deletion(-) diff --git a/src/layer/binaryop.cpp b/src/layer/binaryop.cpp index 442d43b500c..80d0588c759 100644 --- a/src/layer/binaryop.cpp +++ b/src/layer/binaryop.cpp @@ -239,6 +239,14 @@ struct binary_op_ratan2 } }; +struct binary_op_remainder +{ + float operator()(const float& x, const float& y) const + { + return (float)remainderf(x, y); + } +}; + static void binary_op_broadcast(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt) { if (op_type == BinaryOp::Operation_ADD) return binary_op_broadcast(a, b, c, opt); @@ -253,6 +261,7 @@ static void binary_op_broadcast(const Mat& a, const Mat& b, Mat& c, int op_type, if (op_type == BinaryOp::Operation_RPOW) return binary_op_broadcast(b, a, c, opt); if (op_type == BinaryOp::Operation_ATAN2) return binary_op_broadcast(a, b, c, opt); if (op_type == BinaryOp::Operation_RATAN2) return binary_op_broadcast(b, a, c, opt); + if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_broadcast(b, a, c, opt); // should never reach here } @@ -271,6 +280,7 @@ static void binary_op_scalar_inplace(Mat& bottom_top_blob, float b, int op_type, if (op_type == BinaryOp::Operation_RPOW) return binary_op_scalar_inplace(bottom_top_blob, b, opt); if (op_type == BinaryOp::Operation_ATAN2) return binary_op_scalar_inplace(bottom_top_blob, b, opt); if (op_type == BinaryOp::Operation_RATAN2) return binary_op_scalar_inplace(bottom_top_blob, b, opt); + if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_scalar_inplace(bottom_top_blob, b, opt); // should never reach here } diff --git a/src/layer/binaryop.h b/src/layer/binaryop.h index 5fc06918d20..f22d970be6c 100644 --- a/src/layer/binaryop.h +++ b/src/layer/binaryop.h @@ -45,7 +45,8 @@ class BinaryOp : public Layer Operation_RDIV = 8, Operation_RPOW = 9, Operation_ATAN2 = 10, - Operation_RATAN2 = 11 + Operation_RATAN2 = 11, + Operation_REMAINDER = 12 }; public: diff --git a/src/layer/x86/avx512_mathfun.h b/src/layer/x86/avx512_mathfun.h index b5e47bdbe68..725643d9e2d 100644 --- a/src/layer/x86/avx512_mathfun.h +++ b/src/layer/x86/avx512_mathfun.h @@ -856,4 +856,11 @@ static NCNN_FORCEINLINE __m512 abs512_ps(__m512 x) return _mm512_andnot_ps(magic_negative_zero, x); } +static NCNN_FORCEINLINE __m512 remainder512_ps(__m512 x, __m512 y) +{ + const __m512 round_div_result = _mm512_div_round_ps(x, y, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); + const __m512 mul_result = _mm512_mul_ps(round_div_result, y); + return _mm512_sub_ps(x, mul_result); +} + #endif // AVX512_MATHFUN_H diff --git a/src/layer/x86/avx_mathfun.h b/src/layer/x86/avx_mathfun.h index 65c34efc23e..24788cdf7a1 100644 --- a/src/layer/x86/avx_mathfun.h +++ b/src/layer/x86/avx_mathfun.h @@ -1087,4 +1087,12 @@ static NCNN_FORCEINLINE __m256 abs256_ps(__m256 x) return _mm256_andnot_ps(magic_negative_zero, x); } +static NCNN_FORCEINLINE __m256 remainder256_ps(__m256 x, __m256 y) +{ + const __m256 div_result = _mm256_div_ps(x, y); + const __m256 round_result = _mm256_round_ps(div_result, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); + const __m256 mul_result = _mm256_mul_ps(round_result, y); + return _mm256_sub_ps(x, mul_result); +} + #endif // AVX_MATHFUN_H diff --git a/src/layer/x86/binaryop_x86.cpp b/src/layer/x86/binaryop_x86.cpp index 328b0484a3f..5123767b23a 100644 --- a/src/layer/x86/binaryop_x86.cpp +++ b/src/layer/x86/binaryop_x86.cpp @@ -791,6 +791,32 @@ struct binary_op_ratan2 #endif // __SSE2__ }; +struct binary_op_remainder +{ + float func(const float& x, const float& y) const + { + return (float)remainderf(x, y); + } +#if __SSE2__ + __m128 func_pack4(const __m128& x, const __m128& y) const + { + return remainder_ps(x, y); + } +#if __AVX__ + __m256 func_pack8(const __m256& x, const __m256& y) const + { + return remainder256_ps(x, y); + } +#if __AVX512F__ + __m512 func_pack16(const __m512& x, const __m512& y) const + { + return remainder512_ps(x, y); + } +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ +}; + } // namespace BinaryOp_x86_functor static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr, int aw, int bw, int ap, int bp, int op_type) @@ -809,6 +835,7 @@ static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr, if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); // should never reach here } diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h index b7cecfb8123..72727e407d7 100644 --- a/src/layer/x86/sse_mathfun.h +++ b/src/layer/x86/sse_mathfun.h @@ -32,6 +32,8 @@ #ifndef SSE_MATHFUN_H #define SSE_MATHFUN_H +#include +#include #define USE_SSE2 1 #include @@ -1157,4 +1159,12 @@ static NCNN_FORCEINLINE __m128 abs_ps(__m128 inputs) return _mm_andnot_ps(magic_negative_zero, inputs); } +static NCNN_FORCEINLINE __m128 remainder_ps(__m128 x, __m128 y) +{ + const __m128 div_result = _mm_div_ps(x, y); + const __m128 round_result = _mm_round_ps(div_result, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); + const __m128 mul_result = _mm_mul_ps(round_result, y); + return _mm_sub_ps(x, mul_result); +} + #endif // SSE_MATHFUN_H From 460b4d41277c006beb638e7a6927a4859397b511 Mon Sep 17 00:00:00 2001 From: Fisher Date: Fri, 4 Aug 2023 23:13:31 +0800 Subject: [PATCH 04/23] Remove headers --- src/layer/x86/sse_mathfun.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h index 72727e407d7..b9352fec6c4 100644 --- a/src/layer/x86/sse_mathfun.h +++ b/src/layer/x86/sse_mathfun.h @@ -32,8 +32,6 @@ #ifndef SSE_MATHFUN_H #define SSE_MATHFUN_H -#include -#include #define USE_SSE2 1 #include From 5ab6be7036028594896f6826f96adb24cf9d13dc Mon Sep 17 00:00:00 2001 From: Fisher Date: Tue, 8 Aug 2023 22:16:40 +0800 Subject: [PATCH 05/23] Use sse4 --- src/CMakeLists.txt | 2 +- src/layer/x86/sse_mathfun.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cb7be1f6e9c..07496791a96 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -345,7 +345,7 @@ if(NCNN_TARGET_ARCH STREQUAL "x86") if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")) target_compile_options(ncnn PRIVATE /arch:SSE2 /D__SSE2__) else() - target_compile_options(ncnn PRIVATE -msse2 -msse) + target_compile_options(ncnn PRIVATE -msse4.1 -msse2 -msse) if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") target_compile_options(ncnn PRIVATE -msimd128) endif() diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h index b9352fec6c4..cc34d7f7989 100644 --- a/src/layer/x86/sse_mathfun.h +++ b/src/layer/x86/sse_mathfun.h @@ -33,6 +33,7 @@ #define SSE_MATHFUN_H #define USE_SSE2 1 +#define USE_SSE4 1 #include #include @@ -57,6 +58,10 @@ typedef __m128i v4si; // vector of 4 int (sse2) typedef __m64 v2si; // vector of 2 int (mmx) #endif +#ifdef USE_SSE4 +#include +#endif + /* declare some SSE constants -- why can't I figure a better way to do that? */ #define _PS_CONST(Name, Val) \ static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = {Val, Val, Val, Val} From a8d4d97829f2f6979a6f6deba07288ffe3f6052a Mon Sep 17 00:00:00 2001 From: Fisher Date: Tue, 8 Aug 2023 22:18:48 +0800 Subject: [PATCH 06/23] Try support remainder in vulkan --- src/layer/vulkan/shader/binaryop.comp | 1 + src/layer/vulkan/shader/binaryop_broadcast.comp | 1 + src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp | 1 + src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp | 5 +++++ src/layer/vulkan/shader/binaryop_broadcast_pack4.comp | 1 + src/layer/vulkan/shader/binaryop_broadcast_pack8.comp | 5 +++++ src/layer/vulkan/shader/binaryop_pack4.comp | 1 + src/layer/vulkan/shader/binaryop_pack8.comp | 5 +++++ 8 files changed, 20 insertions(+) diff --git a/src/layer/vulkan/shader/binaryop.comp b/src/layer/vulkan/shader/binaryop.comp index 18f566a2a72..02c986817a3 100644 --- a/src/layer/vulkan/shader/binaryop.comp +++ b/src/layer/vulkan/shader/binaryop.comp @@ -137,6 +137,7 @@ void main() if (op_type == 10) res = atan(v1, v2); if (op_type == 11) res = atan(v2, v1); #endif + if (op_type == 12) res = mod(v1, v2); #if NCNN_image_shader image3d_st1(top_blob_3d, ivec3(gx, gy, gz), res); diff --git a/src/layer/vulkan/shader/binaryop_broadcast.comp b/src/layer/vulkan/shader/binaryop_broadcast.comp index 732e3f50b0a..d15d8e855ca 100644 --- a/src/layer/vulkan/shader/binaryop_broadcast.comp +++ b/src/layer/vulkan/shader/binaryop_broadcast.comp @@ -199,6 +199,7 @@ void main() if (op_type == 10) res = atan(v1, v2); if (op_type == 11) res = atan(v2, v1); #endif + if (op_type == 12) res = mod(v1, v2); #if NCNN_image_shader image3d_st1(top_blob_3d, ivec3(gx, gy, gz), res); diff --git a/src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp b/src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp index ced3933db4a..6279153b800 100644 --- a/src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp +++ b/src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp @@ -130,6 +130,7 @@ void main() if (op_type == 10) res = atan(v1, v2); if (op_type == 11) res = atan(v2, v1); #endif + if (op_type == 12) res = mod(v1, v2); #if NCNN_image_shader image3d_st4(top_blob_3d, ivec3(gx, gy, gz), res); diff --git a/src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp b/src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp index 963f9c0030c..9aa70946404 100644 --- a/src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp +++ b/src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp @@ -187,6 +187,11 @@ void main() res[1] = atan(v2[1], v1[1]); #endif } + if (op_type == 12) + { + res[0] = mod(v1[0], v2[0]); + res[1] = mod(v1[1], v2[1]); + } #if NCNN_image_shader image3d_st8(top_blob_3d, ivec3(gx, gy, gz), res); diff --git a/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp b/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp index a0f0376b09e..bcba406a748 100644 --- a/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp +++ b/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp @@ -199,6 +199,7 @@ void main() if (op_type == 10) res = atan(v1, v2); if (op_type == 11) res = atan(v2, v1); #endif + if (op_type == 12) res = mod(v1, v2); #if NCNN_image_shader image3d_st4(top_blob_3d, ivec3(gx, gy, gz), res); diff --git a/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp b/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp index b9e7d492bb9..f5cbd166b34 100644 --- a/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp +++ b/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp @@ -253,6 +253,11 @@ void main() res[1] = atan(v2[1], v1[1]); #endif } + if (op_type == 12) + { + res[0] = mod(v1[0], v2[0]); + res[1] = mod(v1[1], v2[1]); + } #if NCNN_image_shader image3d_st8(top_blob_3d, ivec3(gx, gy, gz), res); diff --git a/src/layer/vulkan/shader/binaryop_pack4.comp b/src/layer/vulkan/shader/binaryop_pack4.comp index 0189253fb3d..a1df2479aeb 100644 --- a/src/layer/vulkan/shader/binaryop_pack4.comp +++ b/src/layer/vulkan/shader/binaryop_pack4.comp @@ -128,6 +128,7 @@ void main() if (op_type == 10) res = atan(v1, v2); if (op_type == 11) res = atan(v2, v1); #endif + if (op_type == 12) res = mod(v1, v2); #if NCNN_image_shader image3d_st4(top_blob_3d, ivec3(gx, gy, gz), res); diff --git a/src/layer/vulkan/shader/binaryop_pack8.comp b/src/layer/vulkan/shader/binaryop_pack8.comp index 9fe54902bd5..924a01f2534 100644 --- a/src/layer/vulkan/shader/binaryop_pack8.comp +++ b/src/layer/vulkan/shader/binaryop_pack8.comp @@ -183,6 +183,11 @@ void main() res[1] = atan(v2[1], v1[1]); #endif } + if (op_type == 12) + { + res[0] = mod(v1[0], v2[0]); + res[1] = mod(v1[1], v2[1]); + } #if NCNN_image_shader image3d_st8(top_blob_3d, ivec3(gx, gy, gz), res); From 184e4efcbaf100a83e43f5fb03eb20e6b4d151ff Mon Sep 17 00:00:00 2001 From: Fisher Date: Tue, 8 Aug 2023 22:19:09 +0800 Subject: [PATCH 07/23] Try support remainder in riscv --- src/layer/riscv/binaryop_riscv.cpp | 3 +++ src/layer/riscv/rvv_mathfun.h | 19 +++++++++++++++++++ src/layer/riscv/rvv_mathfun_fp16s.h | 19 +++++++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/src/layer/riscv/binaryop_riscv.cpp b/src/layer/riscv/binaryop_riscv.cpp index 67ff7ce99f1..31a6af6654d 100644 --- a/src/layer/riscv/binaryop_riscv.cpp +++ b/src/layer/riscv/binaryop_riscv.cpp @@ -295,6 +295,7 @@ MAKE_FUNCTION(binary_op_rdiv, y / x, vfdiv_vv_f32m8(y, x, vl), vfrdiv_vf_f32m8(x MAKE_FUNCTION(binary_op_rpow, (float)pow(y, x), pow_ps(y, x, vl), pow_ps(vfmv_v_f_f32m8(y, vl), x, vl), pow_ps(y, vfmv_v_f_f32m8(x, vl), vl)) MAKE_FUNCTION(binary_op_atan2, (float)atan2(x, y), atan2_ps(x, y, vl), atan2_ps(x, vfmv_v_f_f32m8(y, vl), vl), atan2_ps(vfmv_v_f_f32m8(x, vl), y, vl)) MAKE_FUNCTION(binary_op_ratan2, (float)atan2(y, x), atan2_ps(y, x, vl), atan2_ps(vfmv_v_f_f32m8(y, vl), x, vl), atan2_ps(y, vfmv_v_f_f32m8(x, vl), vl)) +MAKE_FUNCTION(binary_op_remainder, (float)remainderf(x, y), remainder_ps(x, y, vl), remainder_ps(x, vfmv_v_f_f32m8(y, vl), vl), remainder_ps(vfmv_v_f_f32m8(x, vl), y, vl)) // *INDENT-ON* // clang-format on @@ -318,6 +319,7 @@ static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr, if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); // should never reach here } @@ -846,6 +848,7 @@ static void binary_op_vector_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16 if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); // should never reach here } diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h index ebf980060a7..02b6a7417db 100644 --- a/src/layer/riscv/rvv_mathfun.h +++ b/src/layer/riscv/rvv_mathfun.h @@ -566,4 +566,23 @@ _RVV_FLOAT32_ATAN2_OP(2, 16) _RVV_FLOAT32_ATAN2_OP(4, 8) _RVV_FLOAT32_ATAN2_OP(8, 4) +#define _RVV_FLOAT32_REMAINDER_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t remainder_ps(vfloat32m##LMUL##_t x, vfloat32m##LMUL##_t y, size_t vl) \ + { \ + std::vector tmpx(vl); \ + std::vector tmpy(vl); \ + vse32_v_f32m##LMUL(tmpx.data(), x, vl); \ + vse32_v_f32m##LMUL(tmpy.data(), y, vl); \ + for (size_t i = 0; i < vl; i++) \ + { \ + tmpx[i] = remainderf(tmpx[i], tmpy[i]); \ + } \ + return vle32_v_f32m##LMUL(tmpx.data(), vl); \ + } + +_RVV_FLOAT32_REMAINDER_OP(1, 32) +_RVV_FLOAT32_REMAINDER_OP(2, 16) +_RVV_FLOAT32_REMAINDER_OP(4, 8) +_RVV_FLOAT32_REMAINDER_OP(8, 4) + #endif // RVV_MATHFUN_H diff --git a/src/layer/riscv/rvv_mathfun_fp16s.h b/src/layer/riscv/rvv_mathfun_fp16s.h index 47671fe21f0..5280ba584c9 100644 --- a/src/layer/riscv/rvv_mathfun_fp16s.h +++ b/src/layer/riscv/rvv_mathfun_fp16s.h @@ -402,4 +402,23 @@ _RVV_FLOAT16_ATAN2_OP(2, 16) _RVV_FLOAT16_ATAN2_OP(4, 8) _RVV_FLOAT16_ATAN2_OP(8, 4) +#define _RVV_FLOAT16_REMAINDER_OP(LMUL, MLEN) \ + static inline vfloat16m##LMUL##_t remainder_ps(vfloat16m##LMUL##_t x, vfloat16m##LMUL##_t y, size_t vl) \ + { \ + std::vector<__fp16> tmpx(vl); \ + std::vector<__fp16> tmpy(vl); \ + vse16_v_f16m##LMUL(tmpx.data(), x, vl); \ + vse16_v_f16m##LMUL(tmpy.data(), y, vl); \ + for (size_t i = 0; i < vl; i++) \ + { \ + tmpx[i] = (__fp16)remainderf((float)tmpx[i], (float)tmpy[i]); \ + } \ + return vle16_v_f16m##LMUL(tmpx.data(), vl); \ + } + +_RVV_FLOAT16_REMAINDER_OP(1, 32) +_RVV_FLOAT16_REMAINDER_OP(2, 16) +_RVV_FLOAT16_REMAINDER_OP(4, 8) +_RVV_FLOAT16_REMAINDER_OP(8, 4) + #endif // RVV_MATHFUN_FP16S_H From a10197b1085891fab9fe5272c9564dc0189d7ab0 Mon Sep 17 00:00:00 2001 From: Fisher Date: Tue, 29 Aug 2023 11:23:41 +0800 Subject: [PATCH 08/23] Try support remainder in mips --- src/layer/mips/binaryop_mips.cpp | 2 ++ src/layer/mips/msa_mathfun.h | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/src/layer/mips/binaryop_mips.cpp b/src/layer/mips/binaryop_mips.cpp index ebce7743708..d10641a3785 100644 --- a/src/layer/mips/binaryop_mips.cpp +++ b/src/layer/mips/binaryop_mips.cpp @@ -314,6 +314,7 @@ MAKE_FUNCTION(binary_op_rdiv, y / x, __msa_fdiv_w(y, x)) MAKE_FUNCTION(binary_op_rpow, (float)pow(y, x), pow_ps(y, x)) MAKE_FUNCTION(binary_op_atan2, (float)atan2(x, y), atan2_ps(x, y)) MAKE_FUNCTION(binary_op_ratan2, (float)atan2(y, x), atan2_ps(y, x)) +MAKE_FUNCTION(binary_op_remainder, remainder(x, y), remainder_ps(x, y)) // *INDENT-ON* // clang-format on @@ -337,6 +338,7 @@ static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr, if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); // should never reach here } diff --git a/src/layer/mips/msa_mathfun.h b/src/layer/mips/msa_mathfun.h index cab71acbc6b..a005b95564b 100644 --- a/src/layer/mips/msa_mathfun.h +++ b/src/layer/mips/msa_mathfun.h @@ -267,4 +267,17 @@ static inline v4f32 atan2_ps(v4f32 a, v4f32 b) return (v4f32)__msa_ld_w(tmpx, 0); } +static inline v4f32 remainder_ps(v4f32 x, v4f32 y) +{ + float tmpx[4]; + float tmpy[4]; + __msa_st_w(x, tmpx, 0); + __msa_st_w(y, tmpy, 0); + tmpx[0] = remainder(tmpx[0], tmpy[0]); + tmpx[1] = remainder(tmpx[1], tmpy[1]); + tmpx[2] = remainder(tmpx[2], tmpy[2]); + tmpx[3] = remainder(tmpx[3], tmpy[3]); + return __msa_ld_w(tmpx, 0); +} + #endif // MSA_MATHFUN_H From 42950e20765d749bc339266538f31e687b048da5 Mon Sep 17 00:00:00 2001 From: Fisher Date: Tue, 29 Aug 2023 11:24:06 +0800 Subject: [PATCH 09/23] Try support remainder in loongarch --- src/layer/loongarch/binaryop_loongarch.cpp | 2 ++ src/layer/loongarch/lsx_mathfun.h | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/src/layer/loongarch/binaryop_loongarch.cpp b/src/layer/loongarch/binaryop_loongarch.cpp index c5f64c083cd..3d4f191dcda 100644 --- a/src/layer/loongarch/binaryop_loongarch.cpp +++ b/src/layer/loongarch/binaryop_loongarch.cpp @@ -314,6 +314,7 @@ MAKE_FUNCTION(binary_op_rdiv, y / x, __lsx_vfdiv_s(y, x)) MAKE_FUNCTION(binary_op_rpow, (float)pow(y, x), pow_ps(y, x)) MAKE_FUNCTION(binary_op_atan2, (float)atan2(x, y), atan2_ps(x, y)) MAKE_FUNCTION(binary_op_ratan2, (float)atan2(y, x), atan2_ps(y, x)) +MAKE_FUNCTION(binary_op_remainder, remainder(x, y), remainder_ps(x, y)) // *INDENT-ON* // clang-format on @@ -337,6 +338,7 @@ static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr, if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); // should never reach here } diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h index 194f63bedc3..9a67473fcef 100644 --- a/src/layer/loongarch/lsx_mathfun.h +++ b/src/layer/loongarch/lsx_mathfun.h @@ -269,4 +269,17 @@ static inline __m128 atan2_ps(__m128 a, __m128 b) return (__m128)__lsx_vld(tmpx, 0); } +static inline __m128 remainder_ps(__m128 x, __m128 y) +{ + float tmpx[4]; + float tmpy[4]; + __lsx_vst(x, tmpx, 0); + __lsx_vst(y, tmpy, 0); + tmpx[0] = remainder(tmpx[0], tmpy[0]); + tmpx[1] = remainder(tmpx[1], tmpy[1]); + tmpx[2] = remainder(tmpx[2], tmpy[2]); + tmpx[3] = remainder(tmpx[3], tmpy[3]); + return __lsx_vld(tmpx, 0); +} + #endif // LSX_MATHFUN_H From 1fd5705ba86a258b25e67228f4869de922f8c4fe Mon Sep 17 00:00:00 2001 From: Fisher Date: Tue, 29 Aug 2023 11:24:26 +0800 Subject: [PATCH 10/23] Try support remainder in arm --- src/layer/arm/binaryop_arm.cpp | 2 ++ src/layer/arm/binaryop_arm_asimdhp.cpp | 2 ++ src/layer/arm/neon_mathfun.h | 11 +++++++++++ 3 files changed, 15 insertions(+) diff --git a/src/layer/arm/binaryop_arm.cpp b/src/layer/arm/binaryop_arm.cpp index c0d2b9bbbb1..f70563fbb2b 100644 --- a/src/layer/arm/binaryop_arm.cpp +++ b/src/layer/arm/binaryop_arm.cpp @@ -287,6 +287,7 @@ MAKE_FUNCTION(binary_op_rdiv, y / x, div_ps(y, x)) MAKE_FUNCTION(binary_op_rpow, (float)powf(y, x), pow_ps(y, x)) MAKE_FUNCTION(binary_op_atan2, (float)atan2f(x, y), atan2_ps(x, y)) MAKE_FUNCTION(binary_op_ratan2, (float)atan2f(y, x), atan2_ps(y, x)) +MAKE_FUNCTION(binary_op_remainder, remainderf(x, y), remainder_ps(x, y)) // *INDENT-ON* // clang-format on @@ -310,6 +311,7 @@ static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr, if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector(ptr, ptr1, outptr, aw, bw, ap, bp); // should never reach here } diff --git a/src/layer/arm/binaryop_arm_asimdhp.cpp b/src/layer/arm/binaryop_arm_asimdhp.cpp index 8dc0db288f4..95ccc8df52c 100644 --- a/src/layer/arm/binaryop_arm_asimdhp.cpp +++ b/src/layer/arm/binaryop_arm_asimdhp.cpp @@ -330,6 +330,7 @@ MAKE_FUNCTION(binary_op_rdiv_fp16s, y / x, vdiv_f16(y, x), vdivq_f16(y, x)) MAKE_FUNCTION(binary_op_rpow_fp16s, (__fp16)powf(y, x), vcvt_f16_f32(pow_ps(vcvt_f32_f16(y), vcvt_f32_f16(x))), vcombine_f16(vcvt_f16_f32(pow_ps(vcvt_f32_f16(vget_low_f16(y)), vcvt_f32_f16(vget_low_f16(x)))), vcvt_f16_f32(pow_ps(vcvt_f32_f16(vget_high_f16(y)), vcvt_f32_f16(vget_high_f16(x)))))) MAKE_FUNCTION(binary_op_atan2_fp16s, (__fp16)atan2f(x, y), vcvt_f16_f32(atan2_ps(vcvt_f32_f16(x), vcvt_f32_f16(y))), vcombine_f16(vcvt_f16_f32(atan2_ps(vcvt_f32_f16(vget_low_f16(x)), vcvt_f32_f16(vget_low_f16(y)))), vcvt_f16_f32(atan2_ps(vcvt_f32_f16(vget_high_f16(x)), vcvt_f32_f16(vget_high_f16(y)))))) MAKE_FUNCTION(binary_op_ratan2_fp16s, (__fp16)atan2f(y, x), vcvt_f16_f32(atan2_ps(vcvt_f32_f16(y), vcvt_f32_f16(x))), vcombine_f16(vcvt_f16_f32(atan2_ps(vcvt_f32_f16(vget_low_f16(y)), vcvt_f32_f16(vget_low_f16(x)))), vcvt_f16_f32(atan2_ps(vcvt_f32_f16(vget_high_f16(y)), vcvt_f32_f16(vget_high_f16(x)))))) +MAKE_FUNCTION(binary_op_remainder_fp16s, (__fp16)remainderf(x, y), vcvt_f16_f32(remainder_ps(vcvt_f32_f16(x), vcvt_f32_f16(y))), vcombine_f16(vcvt_f16_f32(remainder_ps(vcvt_f32_f16(vget_low_f16(x)), vcvt_f32_f16(vget_low_f16(y)))), vcvt_f16_f32(remainder_ps(vcvt_f32_f16(vget_high_f16(x)), vcvt_f32_f16(vget_high_f16(y)))))) // *INDENT-ON* // clang-format on @@ -353,6 +354,7 @@ static void binary_op_vector_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16 if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); // should never reach here } diff --git a/src/layer/arm/neon_mathfun.h b/src/layer/arm/neon_mathfun.h index 78db901d904..e905a516f23 100644 --- a/src/layer/arm/neon_mathfun.h +++ b/src/layer/arm/neon_mathfun.h @@ -424,5 +424,16 @@ static inline float32x4_t atan2_ps(float32x4_t a, float32x4_t b) return vld1q_f32(tmpx); } +static inline float32x4_t remainder_ps(float32x4_t x, float32x4_t y) +{ + float tmpx[4]; + float tmpy[4]; + vst1q_f32(tmpx, x); + vst1q_f32(tmpy, y); + for (int i = 0; i < 4; i++) + tmpx[i] = remainderf(tmpx[i], tmpy[i]); + return vld1q_f32(tmpx); +} + #include "neon_mathfun_tanh.h" #endif // NEON_MATHFUN_H From 0f3c34fc33255348704d08850a69672e26346959 Mon Sep 17 00:00:00 2001 From: Fisher Date: Tue, 12 Sep 2023 21:10:43 +0800 Subject: [PATCH 11/23] Change tests/test_binaryop OP_TYPE_MAX from 12 to 13 --- tests/test_binaryop.cpp | 2 +- tests/test_binaryop_1.cpp | 2 +- tests/test_binaryop_2.cpp | 2 +- tests/test_binaryop_3.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_binaryop.cpp b/tests/test_binaryop.cpp index beb746388f2..9e257858545 100644 --- a/tests/test_binaryop.cpp +++ b/tests/test_binaryop.cpp @@ -15,7 +15,7 @@ #include "layer/binaryop.h" #include "testutil.h" -#define OP_TYPE_MAX 12 +#define OP_TYPE_MAX 13 static int op_type = 0; diff --git a/tests/test_binaryop_1.cpp b/tests/test_binaryop_1.cpp index 4ce81714111..ad2c434075d 100644 --- a/tests/test_binaryop_1.cpp +++ b/tests/test_binaryop_1.cpp @@ -15,7 +15,7 @@ #include "layer/binaryop.h" #include "testutil.h" -#define OP_TYPE_MAX 12 +#define OP_TYPE_MAX 13 static int op_type = 0; diff --git a/tests/test_binaryop_2.cpp b/tests/test_binaryop_2.cpp index f0730b1436e..b53f3e3259e 100644 --- a/tests/test_binaryop_2.cpp +++ b/tests/test_binaryop_2.cpp @@ -15,7 +15,7 @@ #include "layer/binaryop.h" #include "testutil.h" -#define OP_TYPE_MAX 12 +#define OP_TYPE_MAX 13 static int op_type = 0; diff --git a/tests/test_binaryop_3.cpp b/tests/test_binaryop_3.cpp index d5e58ccc33e..42e6ed53ddc 100644 --- a/tests/test_binaryop_3.cpp +++ b/tests/test_binaryop_3.cpp @@ -15,7 +15,7 @@ #include "layer/binaryop.h" #include "testutil.h" -#define OP_TYPE_MAX 12 +#define OP_TYPE_MAX 13 static int op_type = 0; From 66887d94427fc6b1e19d04e184f9d45ff1d9baa4 Mon Sep 17 00:00:00 2001 From: Fisher Date: Wed, 13 Sep 2023 19:43:33 +0800 Subject: [PATCH 12/23] Fix build error on riscv --- src/layer/riscv/binaryop_riscv.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/layer/riscv/binaryop_riscv.cpp b/src/layer/riscv/binaryop_riscv.cpp index 31a6af6654d..efb87f0b245 100644 --- a/src/layer/riscv/binaryop_riscv.cpp +++ b/src/layer/riscv/binaryop_riscv.cpp @@ -825,6 +825,7 @@ MAKE_FUNCTION(binary_op_rdiv_fp16s, y / x, vfdiv_vv_f16m8(y, x, vl), vfrdiv_vf_f MAKE_FUNCTION(binary_op_rpow_fp16s, (__fp16)pow((float)y, (float)x), pow_ps(y, x, vl), pow_ps(vfmv_v_f_f16m8(y, vl), x, vl), pow_ps(y, vfmv_v_f_f16m8(x, vl), vl)) MAKE_FUNCTION(binary_op_atan2_fp16s, (__fp16)atan2((float)x, (float)y), atan2_ps(x, y, vl), atan2_ps(x, vfmv_v_f_f16m8(y, vl), vl), atan2_ps(vfmv_v_f_f16m8(x, vl), y, vl)) MAKE_FUNCTION(binary_op_ratan2_fp16s, (__fp16)atan2((float)y, (float)x), atan2_ps(y, x, vl), atan2_ps(vfmv_v_f_f16m8(y, vl), x, vl), atan2_ps(y, vfmv_v_f_f16m8(x, vl), vl)) +MAKE_FUNCTION(binary_op_remainder_fp16s, (__fp16)remainderf((float)x, (float)y), remainder_ps(x, y, vl), remainder_ps(x, vfmv_v_f_f32m8(y, vl), vl), remainder_ps(vfmv_v_f_f32m8(x, vl), y, vl)) // *INDENT-ON* // clang-format on From 5df5c8dd8c537897a91d6b04a1384a2a9aa7ae37 Mon Sep 17 00:00:00 2001 From: Fisher Date: Wed, 13 Sep 2023 19:44:06 +0800 Subject: [PATCH 13/23] Add pnnx convertor --- tools/pnnx/src/pass_ncnn/expand_expression.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/pnnx/src/pass_ncnn/expand_expression.cpp b/tools/pnnx/src/pass_ncnn/expand_expression.cpp index f6022be665f..7055f2af4e0 100644 --- a/tools/pnnx/src/pass_ncnn/expand_expression.cpp +++ b/tools/pnnx/src/pass_ncnn/expand_expression.cpp @@ -199,7 +199,7 @@ static std::string expand_expression(Graph& graph, const Operator* op, int& pnnx if (t == "div") op_binary->params["0"] = 3; if (t == "floor_divide") fprintf(stderr, "BinaryOp floor_divide not supported yet\n"); // TODO if (t == "fmod") fprintf(stderr, "BinaryOp fmod not supported yet\n"); // TODO - if (t == "remainder") fprintf(stderr, "BinaryOp remainder not supported yet\n"); // TODO + if (t == "remainder") op_binary->params["0"] = 12; if (t == "pow") op_binary->params["0"] = 6; if (t == "atan2") op_binary->params["0"] = 10; From a3e022fb2ea5461bb1efa7050c27a7cf52d190c5 Mon Sep 17 00:00:00 2001 From: Fisher Date: Wed, 13 Sep 2023 19:44:47 +0800 Subject: [PATCH 14/23] Add remainder python unittest --- tools/pnnx/tests/ncnn/CMakeLists.txt | 1 + tools/pnnx/tests/ncnn/test_torch_remainder.py | 61 +++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 tools/pnnx/tests/ncnn/test_torch_remainder.py diff --git a/tools/pnnx/tests/ncnn/CMakeLists.txt b/tools/pnnx/tests/ncnn/CMakeLists.txt index 945576bfaf6..0dee6c2f52c 100644 --- a/tools/pnnx/tests/ncnn/CMakeLists.txt +++ b/tools/pnnx/tests/ncnn/CMakeLists.txt @@ -175,6 +175,7 @@ pnnx_ncnn_add_test(torch_log10) pnnx_ncnn_add_test(torch_neg) pnnx_ncnn_add_test(torch_pow) pnnx_ncnn_add_test(torch_reciprocal) +pnnx_ncnn_add_test(torch_remainder) pnnx_ncnn_add_test(torch_round) pnnx_ncnn_add_test(torch_rsqrt) pnnx_ncnn_add_test(torch_sin) diff --git a/tools/pnnx/tests/ncnn/test_torch_remainder.py b/tools/pnnx/tests/ncnn/test_torch_remainder.py new file mode 100644 index 00000000000..ffa06dba25f --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_remainder.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.remainder(x, y) + out1 = torch.remainder(y, y) + out2 = torch.remainder(z, torch.ones_like(z) + 0.5) + return out0, out1, out2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_remainder.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_remainder.pt inputshape=[3,16],[3,16],[5,9,3]") + + # ncnn inference + import test_torch_remainder_ncnn + b = test_torch_remainder_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) From 21e6ca0834afa4120f91fe903b9615b6695a14ea Mon Sep 17 00:00:00 2001 From: Fisher Date: Thu, 21 Sep 2023 13:57:02 +0000 Subject: [PATCH 15/23] Try fix result error on x86 --- src/layer/binaryop.cpp | 6 +++++- src/layer/x86/avx512_mathfun.h | 2 +- src/layer/x86/avx_mathfun.h | 2 +- src/layer/x86/binaryop_x86.cpp | 5 ++++- src/layer/x86/sse_mathfun.h | 2 +- 5 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/layer/binaryop.cpp b/src/layer/binaryop.cpp index 80d0588c759..098687e589f 100644 --- a/src/layer/binaryop.cpp +++ b/src/layer/binaryop.cpp @@ -243,7 +243,11 @@ struct binary_op_remainder { float operator()(const float& x, const float& y) const { - return (float)remainderf(x, y); + float div_result = x / y; + float round_result = roundf(div_result); + float res = x - y * round_result; + return res; + // return (float)remainderf(x, y); } }; diff --git a/src/layer/x86/avx512_mathfun.h b/src/layer/x86/avx512_mathfun.h index 725643d9e2d..cc96a02273e 100644 --- a/src/layer/x86/avx512_mathfun.h +++ b/src/layer/x86/avx512_mathfun.h @@ -859,7 +859,7 @@ static NCNN_FORCEINLINE __m512 abs512_ps(__m512 x) static NCNN_FORCEINLINE __m512 remainder512_ps(__m512 x, __m512 y) { const __m512 round_div_result = _mm512_div_round_ps(x, y, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); - const __m512 mul_result = _mm512_mul_ps(round_div_result, y); + const __m512 mul_result = _mm512_mul_ps(y, round_div_result); return _mm512_sub_ps(x, mul_result); } diff --git a/src/layer/x86/avx_mathfun.h b/src/layer/x86/avx_mathfun.h index 24788cdf7a1..a4debeeb3ce 100644 --- a/src/layer/x86/avx_mathfun.h +++ b/src/layer/x86/avx_mathfun.h @@ -1091,7 +1091,7 @@ static NCNN_FORCEINLINE __m256 remainder256_ps(__m256 x, __m256 y) { const __m256 div_result = _mm256_div_ps(x, y); const __m256 round_result = _mm256_round_ps(div_result, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); - const __m256 mul_result = _mm256_mul_ps(round_result, y); + const __m256 mul_result = _mm256_mul_ps(y, round_result); return _mm256_sub_ps(x, mul_result); } diff --git a/src/layer/x86/binaryop_x86.cpp b/src/layer/x86/binaryop_x86.cpp index 5123767b23a..d438e6920b1 100644 --- a/src/layer/x86/binaryop_x86.cpp +++ b/src/layer/x86/binaryop_x86.cpp @@ -795,7 +795,10 @@ struct binary_op_remainder { float func(const float& x, const float& y) const { - return (float)remainderf(x, y); + float div_result = x / y; + float round_result = roundf(div_result); + float res = x - y * round_result; + return res; } #if __SSE2__ __m128 func_pack4(const __m128& x, const __m128& y) const diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h index cc34d7f7989..2c7ef13386d 100644 --- a/src/layer/x86/sse_mathfun.h +++ b/src/layer/x86/sse_mathfun.h @@ -1166,7 +1166,7 @@ static NCNN_FORCEINLINE __m128 remainder_ps(__m128 x, __m128 y) { const __m128 div_result = _mm_div_ps(x, y); const __m128 round_result = _mm_round_ps(div_result, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); - const __m128 mul_result = _mm_mul_ps(round_result, y); + const __m128 mul_result = _mm_mul_ps(y, round_result); return _mm_sub_ps(x, mul_result); } From cea002681c744c5f8e43838885aa66352962343a Mon Sep 17 00:00:00 2001 From: Fisher Date: Thu, 21 Sep 2023 14:10:26 +0000 Subject: [PATCH 16/23] Fix args in binaryop.cpp --- src/layer/binaryop.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layer/binaryop.cpp b/src/layer/binaryop.cpp index 098687e589f..b220beb2da5 100644 --- a/src/layer/binaryop.cpp +++ b/src/layer/binaryop.cpp @@ -265,7 +265,7 @@ static void binary_op_broadcast(const Mat& a, const Mat& b, Mat& c, int op_type, if (op_type == BinaryOp::Operation_RPOW) return binary_op_broadcast(b, a, c, opt); if (op_type == BinaryOp::Operation_ATAN2) return binary_op_broadcast(a, b, c, opt); if (op_type == BinaryOp::Operation_RATAN2) return binary_op_broadcast(b, a, c, opt); - if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_broadcast(b, a, c, opt); + if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_broadcast(a, b, c, opt); // should never reach here } From 22ce01f2138136c95e48695e7a0318e66459f4a9 Mon Sep 17 00:00:00 2001 From: Fisher Date: Tue, 17 Oct 2023 09:14:51 +0000 Subject: [PATCH 17/23] Fix compute error for remainder on x86 --- src/CMakeLists.txt | 2 +- src/layer/binaryop.cpp | 9 ++++----- src/layer/x86/avx512_mathfun.h | 5 +++-- src/layer/x86/avx_mathfun.h | 4 ++-- src/layer/x86/binaryop_x86.cpp | 8 ++++---- src/layer/x86/sse_mathfun.h | 14 +++++++------- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 07496791a96..cb7be1f6e9c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -345,7 +345,7 @@ if(NCNN_TARGET_ARCH STREQUAL "x86") if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")) target_compile_options(ncnn PRIVATE /arch:SSE2 /D__SSE2__) else() - target_compile_options(ncnn PRIVATE -msse4.1 -msse2 -msse) + target_compile_options(ncnn PRIVATE -msse2 -msse) if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") target_compile_options(ncnn PRIVATE -msimd128) endif() diff --git a/src/layer/binaryop.cpp b/src/layer/binaryop.cpp index b220beb2da5..6f9afc55c1a 100644 --- a/src/layer/binaryop.cpp +++ b/src/layer/binaryop.cpp @@ -243,11 +243,10 @@ struct binary_op_remainder { float operator()(const float& x, const float& y) const { - float div_result = x / y; - float round_result = roundf(div_result); - float res = x - y * round_result; - return res; - // return (float)remainderf(x, y); + const float div_result = x / y; + const float floor_result = floorf(div_result); + const float mul_result = floor_result * y; + return x - mul_result; } }; diff --git a/src/layer/x86/avx512_mathfun.h b/src/layer/x86/avx512_mathfun.h index cc96a02273e..4bd8d074202 100644 --- a/src/layer/x86/avx512_mathfun.h +++ b/src/layer/x86/avx512_mathfun.h @@ -858,8 +858,9 @@ static NCNN_FORCEINLINE __m512 abs512_ps(__m512 x) static NCNN_FORCEINLINE __m512 remainder512_ps(__m512 x, __m512 y) { - const __m512 round_div_result = _mm512_div_round_ps(x, y, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); - const __m512 mul_result = _mm512_mul_ps(y, round_div_result); + const __m512 div_result = _mm512_div_ps(x, y); + const __m512 floor_result = _mm512_floor_ps(div_result); + const __m512 mul_result = _mm512_mul_ps(y, floor_result); return _mm512_sub_ps(x, mul_result); } diff --git a/src/layer/x86/avx_mathfun.h b/src/layer/x86/avx_mathfun.h index a4debeeb3ce..d3708f7fb1b 100644 --- a/src/layer/x86/avx_mathfun.h +++ b/src/layer/x86/avx_mathfun.h @@ -1090,8 +1090,8 @@ static NCNN_FORCEINLINE __m256 abs256_ps(__m256 x) static NCNN_FORCEINLINE __m256 remainder256_ps(__m256 x, __m256 y) { const __m256 div_result = _mm256_div_ps(x, y); - const __m256 round_result = _mm256_round_ps(div_result, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); - const __m256 mul_result = _mm256_mul_ps(y, round_result); + const __m256 floor_result = _mm256_floor_ps(div_result); + const __m256 mul_result = _mm256_mul_ps(y, floor_result); return _mm256_sub_ps(x, mul_result); } diff --git a/src/layer/x86/binaryop_x86.cpp b/src/layer/x86/binaryop_x86.cpp index d438e6920b1..c258f71a2fa 100644 --- a/src/layer/x86/binaryop_x86.cpp +++ b/src/layer/x86/binaryop_x86.cpp @@ -795,10 +795,10 @@ struct binary_op_remainder { float func(const float& x, const float& y) const { - float div_result = x / y; - float round_result = roundf(div_result); - float res = x - y * round_result; - return res; + const float div_result = x / y; + const float floor_result = floorf(div_result); + const float mul_result = floor_result * y; + return x - mul_result; } #if __SSE2__ __m128 func_pack4(const __m128& x, const __m128& y) const diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h index 2c7ef13386d..5ca090ef206 100644 --- a/src/layer/x86/sse_mathfun.h +++ b/src/layer/x86/sse_mathfun.h @@ -33,7 +33,6 @@ #define SSE_MATHFUN_H #define USE_SSE2 1 -#define USE_SSE4 1 #include #include @@ -58,10 +57,6 @@ typedef __m128i v4si; // vector of 4 int (sse2) typedef __m64 v2si; // vector of 2 int (mmx) #endif -#ifdef USE_SSE4 -#include -#endif - /* declare some SSE constants -- why can't I figure a better way to do that? */ #define _PS_CONST(Name, Val) \ static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = {Val, Val, Val, Val} @@ -1165,8 +1160,13 @@ static NCNN_FORCEINLINE __m128 abs_ps(__m128 inputs) static NCNN_FORCEINLINE __m128 remainder_ps(__m128 x, __m128 y) { const __m128 div_result = _mm_div_ps(x, y); - const __m128 round_result = _mm_round_ps(div_result, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); - const __m128 mul_result = _mm_mul_ps(y, round_result); + // Need SSE4.1 + // const __m128 floor_result = _mm_floor_ps(div_result); + const __m128 trunc_result = _mm_cvtepi32_ps(_mm_cvttps_epi32(div_result)); + const __m128 cmp = _mm_cmplt_ps(div_result, trunc_result); + const __m128 one = _mm_set1_ps(1.0f); + const __m128 floor_result = _mm_sub_ps(trunc_result, _mm_and_ps(cmp, one)); + const __m128 mul_result = _mm_mul_ps(y, floor_result); return _mm_sub_ps(x, mul_result); } From 6097a1761b43a4a85bba19ea8ad4a3674fa332b8 Mon Sep 17 00:00:00 2001 From: Fisher Date: Tue, 17 Oct 2023 09:21:54 +0000 Subject: [PATCH 18/23] Fix compute error for remainder on vulkan --- src/layer/vulkan/shader/binaryop.comp | 2 +- src/layer/vulkan/shader/binaryop_broadcast.comp | 2 +- src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp | 2 +- src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp | 4 ++-- src/layer/vulkan/shader/binaryop_broadcast_pack4.comp | 2 +- src/layer/vulkan/shader/binaryop_broadcast_pack8.comp | 4 ++-- src/layer/vulkan/shader/binaryop_pack4.comp | 2 +- src/layer/vulkan/shader/binaryop_pack8.comp | 4 ++-- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/layer/vulkan/shader/binaryop.comp b/src/layer/vulkan/shader/binaryop.comp index 02c986817a3..a6632e0ee1d 100644 --- a/src/layer/vulkan/shader/binaryop.comp +++ b/src/layer/vulkan/shader/binaryop.comp @@ -137,7 +137,7 @@ void main() if (op_type == 10) res = atan(v1, v2); if (op_type == 11) res = atan(v2, v1); #endif - if (op_type == 12) res = mod(v1, v2); + if (op_type == 12) res = v1 - floorf(v1 / v2) * v2; #if NCNN_image_shader image3d_st1(top_blob_3d, ivec3(gx, gy, gz), res); diff --git a/src/layer/vulkan/shader/binaryop_broadcast.comp b/src/layer/vulkan/shader/binaryop_broadcast.comp index d15d8e855ca..93edcb886ab 100644 --- a/src/layer/vulkan/shader/binaryop_broadcast.comp +++ b/src/layer/vulkan/shader/binaryop_broadcast.comp @@ -199,7 +199,7 @@ void main() if (op_type == 10) res = atan(v1, v2); if (op_type == 11) res = atan(v2, v1); #endif - if (op_type == 12) res = mod(v1, v2); + if (op_type == 12) res = v1 - floorf(v1 / v2) * v2; #if NCNN_image_shader image3d_st1(top_blob_3d, ivec3(gx, gy, gz), res); diff --git a/src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp b/src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp index 6279153b800..64cd537b71f 100644 --- a/src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp +++ b/src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp @@ -130,7 +130,7 @@ void main() if (op_type == 10) res = atan(v1, v2); if (op_type == 11) res = atan(v2, v1); #endif - if (op_type == 12) res = mod(v1, v2); + if (op_type == 12) res = v1 - floorf(v1 / v2) * v2; #if NCNN_image_shader image3d_st4(top_blob_3d, ivec3(gx, gy, gz), res); diff --git a/src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp b/src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp index 9aa70946404..f497e0726b2 100644 --- a/src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp +++ b/src/layer/vulkan/shader/binaryop_broadcast_pack1to8.comp @@ -189,8 +189,8 @@ void main() } if (op_type == 12) { - res[0] = mod(v1[0], v2[0]); - res[1] = mod(v1[1], v2[1]); + res[0] = v1[0] - floorf(v1[0] / v2[0]) * v2[0]; + res[1] = v1[1] - floorf(v1[1] / v2[1]) * v2[1]; } #if NCNN_image_shader diff --git a/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp b/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp index bcba406a748..4a9116dfc4f 100644 --- a/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp +++ b/src/layer/vulkan/shader/binaryop_broadcast_pack4.comp @@ -199,7 +199,7 @@ void main() if (op_type == 10) res = atan(v1, v2); if (op_type == 11) res = atan(v2, v1); #endif - if (op_type == 12) res = mod(v1, v2); + if (op_type == 12) res = v1 - floorf(v1 / v2) * v2; #if NCNN_image_shader image3d_st4(top_blob_3d, ivec3(gx, gy, gz), res); diff --git a/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp b/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp index f5cbd166b34..1b6ca17bcbb 100644 --- a/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp +++ b/src/layer/vulkan/shader/binaryop_broadcast_pack8.comp @@ -255,8 +255,8 @@ void main() } if (op_type == 12) { - res[0] = mod(v1[0], v2[0]); - res[1] = mod(v1[1], v2[1]); + res[0] = v1[0] - floorf(v1[0] / v2[0]) * v2[0]; + res[1] = v1[1] - floorf(v1[1] / v2[1]) * v2[1]; } #if NCNN_image_shader diff --git a/src/layer/vulkan/shader/binaryop_pack4.comp b/src/layer/vulkan/shader/binaryop_pack4.comp index a1df2479aeb..f86d53ac8b7 100644 --- a/src/layer/vulkan/shader/binaryop_pack4.comp +++ b/src/layer/vulkan/shader/binaryop_pack4.comp @@ -128,7 +128,7 @@ void main() if (op_type == 10) res = atan(v1, v2); if (op_type == 11) res = atan(v2, v1); #endif - if (op_type == 12) res = mod(v1, v2); + if (op_type == 12) res = v1 - floorf(v1 / v2) * v2; #if NCNN_image_shader image3d_st4(top_blob_3d, ivec3(gx, gy, gz), res); diff --git a/src/layer/vulkan/shader/binaryop_pack8.comp b/src/layer/vulkan/shader/binaryop_pack8.comp index 924a01f2534..1be2bd5fc18 100644 --- a/src/layer/vulkan/shader/binaryop_pack8.comp +++ b/src/layer/vulkan/shader/binaryop_pack8.comp @@ -185,8 +185,8 @@ void main() } if (op_type == 12) { - res[0] = mod(v1[0], v2[0]); - res[1] = mod(v1[1], v2[1]); + res[0] = v1[0] - floorf(v1[0] / v2[0]) * v2[0]; + res[1] = v1[1] - floorf(v1[1] / v2[1]) * v2[1]; } #if NCNN_image_shader From 77f908deaacd64c31b0a60af300f7f06306e50a3 Mon Sep 17 00:00:00 2001 From: Fisher Date: Tue, 17 Oct 2023 09:25:52 +0000 Subject: [PATCH 19/23] Fix compute error for remainder on loongarch --- src/layer/loongarch/binaryop_loongarch.cpp | 2 +- src/layer/loongarch/lsx_mathfun.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/layer/loongarch/binaryop_loongarch.cpp b/src/layer/loongarch/binaryop_loongarch.cpp index 3d4f191dcda..d7d3bb6af7b 100644 --- a/src/layer/loongarch/binaryop_loongarch.cpp +++ b/src/layer/loongarch/binaryop_loongarch.cpp @@ -314,7 +314,7 @@ MAKE_FUNCTION(binary_op_rdiv, y / x, __lsx_vfdiv_s(y, x)) MAKE_FUNCTION(binary_op_rpow, (float)pow(y, x), pow_ps(y, x)) MAKE_FUNCTION(binary_op_atan2, (float)atan2(x, y), atan2_ps(x, y)) MAKE_FUNCTION(binary_op_ratan2, (float)atan2(y, x), atan2_ps(y, x)) -MAKE_FUNCTION(binary_op_remainder, remainder(x, y), remainder_ps(x, y)) +MAKE_FUNCTION(binary_op_remainder, remainderf(x, y), remainder_ps(x, y)) // *INDENT-ON* // clang-format on diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h index 9a67473fcef..8c15d94adb9 100644 --- a/src/layer/loongarch/lsx_mathfun.h +++ b/src/layer/loongarch/lsx_mathfun.h @@ -275,10 +275,10 @@ static inline __m128 remainder_ps(__m128 x, __m128 y) float tmpy[4]; __lsx_vst(x, tmpx, 0); __lsx_vst(y, tmpy, 0); - tmpx[0] = remainder(tmpx[0], tmpy[0]); - tmpx[1] = remainder(tmpx[1], tmpy[1]); - tmpx[2] = remainder(tmpx[2], tmpy[2]); - tmpx[3] = remainder(tmpx[3], tmpy[3]); + tmpx[0] = remainderf(tmpx[0], tmpy[0]); + tmpx[1] = remainderf(tmpx[1], tmpy[1]); + tmpx[2] = remainderf(tmpx[2], tmpy[2]); + tmpx[3] = remainderf(tmpx[3], tmpy[3]); return __lsx_vld(tmpx, 0); } From 8f7770da4f50147b94a83d580096edc40ef2145e Mon Sep 17 00:00:00 2001 From: Fisher Date: Tue, 17 Oct 2023 09:26:14 +0000 Subject: [PATCH 20/23] Fix compute error for remainder on mips --- src/layer/mips/binaryop_mips.cpp | 2 +- src/layer/mips/msa_mathfun.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/layer/mips/binaryop_mips.cpp b/src/layer/mips/binaryop_mips.cpp index d10641a3785..e407bc962c2 100644 --- a/src/layer/mips/binaryop_mips.cpp +++ b/src/layer/mips/binaryop_mips.cpp @@ -314,7 +314,7 @@ MAKE_FUNCTION(binary_op_rdiv, y / x, __msa_fdiv_w(y, x)) MAKE_FUNCTION(binary_op_rpow, (float)pow(y, x), pow_ps(y, x)) MAKE_FUNCTION(binary_op_atan2, (float)atan2(x, y), atan2_ps(x, y)) MAKE_FUNCTION(binary_op_ratan2, (float)atan2(y, x), atan2_ps(y, x)) -MAKE_FUNCTION(binary_op_remainder, remainder(x, y), remainder_ps(x, y)) +MAKE_FUNCTION(binary_op_remainder, remainderf(x, y), remainder_ps(x, y)) // *INDENT-ON* // clang-format on diff --git a/src/layer/mips/msa_mathfun.h b/src/layer/mips/msa_mathfun.h index a005b95564b..9350fd15c7e 100644 --- a/src/layer/mips/msa_mathfun.h +++ b/src/layer/mips/msa_mathfun.h @@ -273,10 +273,10 @@ static inline v4f32 remainder_ps(v4f32 x, v4f32 y) float tmpy[4]; __msa_st_w(x, tmpx, 0); __msa_st_w(y, tmpy, 0); - tmpx[0] = remainder(tmpx[0], tmpy[0]); - tmpx[1] = remainder(tmpx[1], tmpy[1]); - tmpx[2] = remainder(tmpx[2], tmpy[2]); - tmpx[3] = remainder(tmpx[3], tmpy[3]); + tmpx[0] = remainderf(tmpx[0], tmpy[0]); + tmpx[1] = remainderf(tmpx[1], tmpy[1]); + tmpx[2] = remainderf(tmpx[2], tmpy[2]); + tmpx[3] = remainderf(tmpx[3], tmpy[3]); return __msa_ld_w(tmpx, 0); } From a370f4184560035228d5c03a3e03d2885207931f Mon Sep 17 00:00:00 2001 From: Fisher Date: Tue, 17 Oct 2023 09:26:43 +0000 Subject: [PATCH 21/23] Preprocess divisor for remainder on unittest --- tests/test_binaryop_3.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_binaryop_3.cpp b/tests/test_binaryop_3.cpp index 42e6ed53ddc..d4bf5faceba 100644 --- a/tests/test_binaryop_3.cpp +++ b/tests/test_binaryop_3.cpp @@ -55,6 +55,14 @@ static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b, int flag) b[i] = 0.001f; } } + if (op_type == 12) { + // divisor must be non-zero for remainder + b = b.clone(); + for (int i = 0; i < b.total(); i++) { + if (b[i] == 0.f) + b[i] = 0.001f; + } + } ncnn::ParamDict pd; pd.set(0, op_type); From b4b198e9b4d9b118d6d5a731faf5585768881b33 Mon Sep 17 00:00:00 2001 From: Fisher Date: Wed, 18 Oct 2023 03:06:13 +0000 Subject: [PATCH 22/23] Fix build error --- src/layer/loongarch/lsx_mathfun.h | 2 +- src/layer/mips/msa_mathfun.h | 6 +++--- src/layer/riscv/binaryop_riscv.cpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h index 8c15d94adb9..a09cd6d711a 100644 --- a/src/layer/loongarch/lsx_mathfun.h +++ b/src/layer/loongarch/lsx_mathfun.h @@ -279,7 +279,7 @@ static inline __m128 remainder_ps(__m128 x, __m128 y) tmpx[1] = remainderf(tmpx[1], tmpy[1]); tmpx[2] = remainderf(tmpx[2], tmpy[2]); tmpx[3] = remainderf(tmpx[3], tmpy[3]); - return __lsx_vld(tmpx, 0); + return (__m128)__lsx_vld(tmpx, 0); } #endif // LSX_MATHFUN_H diff --git a/src/layer/mips/msa_mathfun.h b/src/layer/mips/msa_mathfun.h index 9350fd15c7e..07b30ea7f85 100644 --- a/src/layer/mips/msa_mathfun.h +++ b/src/layer/mips/msa_mathfun.h @@ -271,13 +271,13 @@ static inline v4f32 remainder_ps(v4f32 x, v4f32 y) { float tmpx[4]; float tmpy[4]; - __msa_st_w(x, tmpx, 0); - __msa_st_w(y, tmpy, 0); + __msa_st_w((v4i32)x, tmpx, 0); + __msa_st_w((v4i32)y, tmpy, 0); tmpx[0] = remainderf(tmpx[0], tmpy[0]); tmpx[1] = remainderf(tmpx[1], tmpy[1]); tmpx[2] = remainderf(tmpx[2], tmpy[2]); tmpx[3] = remainderf(tmpx[3], tmpy[3]); - return __msa_ld_w(tmpx, 0); + return (v4f32)__msa_ld_w(tmpx, 0); } #endif // MSA_MATHFUN_H diff --git a/src/layer/riscv/binaryop_riscv.cpp b/src/layer/riscv/binaryop_riscv.cpp index efb87f0b245..e9b3e17e16b 100644 --- a/src/layer/riscv/binaryop_riscv.cpp +++ b/src/layer/riscv/binaryop_riscv.cpp @@ -825,7 +825,7 @@ MAKE_FUNCTION(binary_op_rdiv_fp16s, y / x, vfdiv_vv_f16m8(y, x, vl), vfrdiv_vf_f MAKE_FUNCTION(binary_op_rpow_fp16s, (__fp16)pow((float)y, (float)x), pow_ps(y, x, vl), pow_ps(vfmv_v_f_f16m8(y, vl), x, vl), pow_ps(y, vfmv_v_f_f16m8(x, vl), vl)) MAKE_FUNCTION(binary_op_atan2_fp16s, (__fp16)atan2((float)x, (float)y), atan2_ps(x, y, vl), atan2_ps(x, vfmv_v_f_f16m8(y, vl), vl), atan2_ps(vfmv_v_f_f16m8(x, vl), y, vl)) MAKE_FUNCTION(binary_op_ratan2_fp16s, (__fp16)atan2((float)y, (float)x), atan2_ps(y, x, vl), atan2_ps(vfmv_v_f_f16m8(y, vl), x, vl), atan2_ps(y, vfmv_v_f_f16m8(x, vl), vl)) -MAKE_FUNCTION(binary_op_remainder_fp16s, (__fp16)remainderf((float)x, (float)y), remainder_ps(x, y, vl), remainder_ps(x, vfmv_v_f_f32m8(y, vl), vl), remainder_ps(vfmv_v_f_f32m8(x, vl), y, vl)) +MAKE_FUNCTION(binary_op_remainder_fp16s, (__fp16)remainderf((float)x, (float)y), remainder_ps(x, y, vl), remainder_ps(x, vfmv_v_f_f16m8(y, vl), vl), remainder_ps(vfmv_v_f_f16m8(x, vl), y, vl)) // *INDENT-ON* // clang-format on From d917d97a11a23a4f356a7888859de7a1e03b3552 Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 22 Dec 2023 03:44:56 +0000 Subject: [PATCH 23/23] apply code-format changes --- tests/test_binaryop_3.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_binaryop_3.cpp b/tests/test_binaryop_3.cpp index dd5b7abc8fe..1d557a2002c 100644 --- a/tests/test_binaryop_3.cpp +++ b/tests/test_binaryop_3.cpp @@ -55,10 +55,12 @@ static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b, int flag) b[i] = 0.001f; } } - if (op_type == 12) { + if (op_type == 12) + { // divisor must be non-zero for remainder b = b.clone(); - for (int i = 0; i < b.total(); i++) { + for (int i = 0; i < b.total(); i++) + { if (b[i] == 0.f) b[i] = 0.001f; }