diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ece7ada7392..3349ea506ebe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -290,6 +290,19 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips)") else() message(WARNING "The compiler does not support loongson mmi extension. NCNN_MMI will be OFF.") endif() +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64|loongarch32)") + set(NCNN_TARGET_ARCH loongarch) + + include(CheckCXXCompilerFlag) + + check_cxx_compiler_flag("-mlsx" NCNN_COMPILER_SUPPORT_LOONGARCH_LSX) + + if(NCNN_COMPILER_SUPPORT_LOONGARCH_LSX) + option(NCNN_LSX "optimize loongarch platform with lsx extension" ON) + else() + message(WARNING "The compiler does not support lsx extension. NCNN_LSX will be OFF.") + endif() + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)") set(NCNN_TARGET_ARCH riscv) @@ -332,8 +345,6 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)") endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)") set(NCNN_TARGET_ARCH powerpc) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch)") - set(NCNN_TARGET_ARCH mips) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(xtensa)") set(NCNN_TARGET_ARCH xtensa) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x)") diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake index 8abb13331a9b..857d3b528bac 100644 --- a/cmake/ncnn_add_layer.cmake +++ b/cmake/ncnn_add_layer.cmake @@ -270,6 +270,12 @@ macro(ncnn_add_layer class) endif() endif() + if(NCNN_RUNTIME_CPU AND NCNN_TARGET_ARCH STREQUAL "loongarch") + if(NCNN_LSX) + ncnn_add_arch_opt_layer(${class} lsx "-mlsx") + endif() + endif() + if(NCNN_RUNTIME_CPU AND NCNN_RVV AND NCNN_TARGET_ARCH STREQUAL "riscv") if(NCNN_COMPILER_SUPPORT_RVV_ZFH) ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh") diff --git a/cmake/ncnn_generate_lsx_source.cmake b/cmake/ncnn_generate_lsx_source.cmake new file mode 100644 index 000000000000..4f8fb20299aa --- /dev/null +++ b/cmake/ncnn_generate_lsx_source.cmake @@ -0,0 +1,14 @@ + +# must define SRC DST CLASS + +file(READ ${SRC} source_data) + +# replace +string(TOUPPER ${CLASS} CLASS_UPPER) +string(TOLOWER ${CLASS} CLASS_LOWER) + +string(REGEX REPLACE "LAYER_${CLASS_UPPER}_LOONGARCH_H" "LAYER_${CLASS_UPPER}_LOONGARCH_LSX_H" source_data "${source_data}") +string(REGEX REPLACE "${CLASS}_loongarch" "${CLASS}_loongarch_lsx" source_data "${source_data}") +string(REGEX REPLACE "#include \"${CLASS_LOWER}_loongarch.h\"" "#include \"${CLASS_LOWER}_loongarch_lsx.h\"" source_data "${source_data}") + +file(WRITE ${DST} "${source_data}") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 11b8573462a8..bb9786749590 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -457,6 +457,12 @@ if(NCNN_TARGET_ARCH STREQUAL "mips") endif() endif() +if(NCNN_TARGET_ARCH STREQUAL "loongarch") + if(NOT NCNN_RUNTIME_CPU AND NCNN_LSX) + target_compile_options(ncnn PRIVATE -mlsx) + endif() +endif() + if(NCNN_TARGET_ARCH STREQUAL "riscv" AND NOT C906) if(NOT NCNN_RUNTIME_CPU AND NCNN_RVV) if(NCNN_COMPILER_SUPPORT_RVV_ZFH) diff --git a/src/cpu.cpp b/src/cpu.cpp index 197093d6dd21..ca90860ff012 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -159,7 +159,7 @@ static unsigned int get_elf_hwcap_from_proc_self_auxv(unsigned int type) return 0; } -#if __aarch64__ || __mips64 || __riscv_xlen == 64 +#if __aarch64__ || __mips64 || __riscv_xlen == 64 || __loongarch64 struct { uint64_t tag; @@ -236,6 +236,12 @@ static unsigned int g_hwcaps2 = get_elf_hwcap(AT_HWCAP2); #define HWCAP_LOONGSON_MMI (1 << 11) #endif +#if __loongarch64 +// from 
arch/loongarch/include/uapi/asm/hwcap.h +#define HWCAP_LOONGARCH_LSX (1 << 4) +#define HWCAP_LOONGARCH_LASX (1 << 5) +#endif + #if __riscv // from arch/riscv/include/uapi/asm/hwcap.h #define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A')) @@ -1001,6 +1007,32 @@ int cpu_support_mips_msa() #endif } +int cpu_support_loongarch_lsx() +{ +#if defined __ANDROID__ || defined __linux__ +#if __loongarch64 + return g_hwcaps & HWCAP_LOONGARCH_LSX; +#else + return 0; +#endif +#else + return 0; +#endif +} + +int cpu_support_loongarch_lasx() +{ +#if defined __ANDROID__ || defined __linux__ +#if __loongarch64 + return g_hwcaps & HWCAP_LOONGARCH_LASX; +#else + return 0; +#endif +#else + return 0; +#endif +} + int cpu_support_loongson_mmi() { #if defined __ANDROID__ || defined __linux__ diff --git a/src/cpu.h b/src/cpu.h index 5a94106ef478..54bacc0c25af 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -93,6 +93,11 @@ NCNN_EXPORT int cpu_support_x86_avx512_bf16(); // avx512_fp16 = x86 avx512 fp16 NCNN_EXPORT int cpu_support_x86_avx512_fp16(); +// lsx = loongarch lsx +NCNN_EXPORT int cpu_support_loongarch_lsx(); +// lasx = loongarch lasx +NCNN_EXPORT int cpu_support_loongarch_lasx(); + // msa = mips mas NCNN_EXPORT int cpu_support_mips_msa(); // mmi = loongson mmi diff --git a/src/layer.cpp b/src/layer.cpp index 518b666ec23f..953aebcd2bd7 100644 --- a/src/layer.cpp +++ b/src/layer.cpp @@ -253,6 +253,13 @@ Layer* create_layer(int index) } else #endif // NCNN_RUNTIME_CPU && NCNN_AVX +#if NCNN_RUNTIME_CPU && NCNN_LSX + if (ncnn::cpu_support_loongarch_lsx()) + { + layer_creator = layer_registry_lsx[index].creator; + } + else +#endif // NCNN_RUNTIME_CPU && NCNN_LSX #if NCNN_RUNTIME_CPU && NCNN_MSA if (ncnn::cpu_support_mips_msa()) { diff --git a/src/layer/loongarch/absval_loongarch.cpp b/src/layer/loongarch/absval_loongarch.cpp new file mode 100644 index 000000000000..ea60b01eaf02 --- /dev/null +++ b/src/layer/loongarch/absval_loongarch.cpp @@ -0,0 +1,67 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
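+// Implementation note: the LSX path below computes |x| by clearing the IEEE-754 sign
+// bit (bit 31) of each lane with __lsx_vbitclri_w, handling 4 floats per
+// __lsx_vld/__lsx_vst pair; the trailing scalar loop covers the leftover elements.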
+ +#include "absval_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +namespace ncnn { + +AbsVal_loongarch::AbsVal_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int AbsVal_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128i _p = __lsx_vld(ptr, 0); + __m128i _outp = __lsx_vbitclri_w(_p, 31); + __lsx_vst(_outp, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = *ptr > 0 ? *ptr : -*ptr; + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/absval_loongarch.h b/src/layer/loongarch/absval_loongarch.h new file mode 100644 index 000000000000..0a3143cea432 --- /dev/null +++ b/src/layer/loongarch/absval_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_ABSVAL_LOONGARCH_H +#define LAYER_ABSVAL_LOONGARCH_H + +#include "absval.h" + +namespace ncnn { + +class AbsVal_loongarch : virtual public AbsVal +{ +public: + AbsVal_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_ABSVAL_LOONGARCH_H diff --git a/src/layer/loongarch/batchnorm_loongarch.cpp b/src/layer/loongarch/batchnorm_loongarch.cpp new file mode 100644 index 000000000000..f0e33b78efdc --- /dev/null +++ b/src/layer/loongarch/batchnorm_loongarch.cpp @@ -0,0 +1,145 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
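+// Implementation note: batchnorm is applied as y = b * x + a per element, with
+// b = slope / sqrt(var + eps) and a = bias - slope * mean / sqrt(var + eps)
+// precomputed by the base BatchNorm class. The LSX path fuses the multiply-add into
+// __lsx_vfmadd_s on pack-4 data and falls back to a scalar loop for remaining elements.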
+ +#include "batchnorm_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +BatchNorm_loongarch::BatchNorm_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int BatchNorm_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + int w = bottom_top_blob.w * elempack; + +#if __loongarch_sx + int nn_w = w / 4; + int remain_w_start = nn_w * 4; +#else + int remain_w_start = 0; +#endif // __loongarch_sx + + float* ptr = bottom_top_blob; + +#if __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < nn_w; i++) + { + float* ptr0 = ptr + i * 4; + + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __m128 _a = (__m128)__lsx_vld((const float*)a_data + i * 4, 0); + __m128 _b = (__m128)__lsx_vld((const float*)b_data + i * 4, 0); + _p = __lsx_vfmadd_s(_b, _p, _a); + __lsx_vst(_p, ptr0, 0); + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_w_start; i < w; i++) + { + ptr[i] = b_data[i] * ptr[i] + a_data[i]; + } + } + + if (dims == 2) + { + int w = bottom_top_blob.w * elempack; + int h = bottom_top_blob.h; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + float a = a_data[i]; + float b = b_data[i]; + + int j = 0; +#if __loongarch_sx + __m128 _a = elempack == 4 ? (__m128)__lsx_vld((const float*)a_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(a); + __m128 _b = elempack == 4 ? (__m128)__lsx_vld((const float*)b_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(b); + for (; j + 3 < w; j += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmadd_s(_b, _p, _a); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + *ptr = b * *ptr + a; + ptr++; + } + } + } + + if (dims == 3 || dims == 4) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + float* ptr = bottom_top_blob.channel(q); + float a = a_data[q]; + float b = b_data[q]; + + int i = 0; +#if __loongarch_sx + __m128 _a = elempack == 4 ? (__m128)__lsx_vld((const float*)a_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(a); + __m128 _b = elempack == 4 ? (__m128)__lsx_vld((const float*)b_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(b); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmadd_s(_b, _p, _a); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = b * *ptr + a; + ptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/batchnorm_loongarch.h b/src/layer/loongarch/batchnorm_loongarch.h new file mode 100644 index 000000000000..8b38d5e1f666 --- /dev/null +++ b/src/layer/loongarch/batchnorm_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_BATCHNORM_LOONGARCH_H +#define LAYER_BATCHNORM_LOONGARCH_H + +#include "batchnorm.h" + +namespace ncnn { + +class BatchNorm_loongarch : virtual public BatchNorm +{ +public: + BatchNorm_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_BATCHNORM_LOONGARCH_H diff --git a/src/layer/loongarch/bias_loongarch.cpp b/src/layer/loongarch/bias_loongarch.cpp new file mode 100644 index 000000000000..74129a8d3284 --- /dev/null +++ b/src/layer/loongarch/bias_loongarch.cpp @@ -0,0 +1,70 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "bias_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +int Bias_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int size = w * h * d; + + const float* bias_ptr = bias_data; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + float bias = bias_ptr[q]; + +#if __loongarch_sx + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __loongarch_sx + +#if __loongarch_sx + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias); + for (; nn > 0; nn--) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = __lsx_vfadd_s(_p, _bias); + __lsx_vst(_outp, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + + for (; remain > 0; remain--) + { + *ptr = *ptr + bias; + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/bias_loongarch.h b/src/layer/loongarch/bias_loongarch.h new file mode 100644 index 000000000000..f122ffa0dd92 --- /dev/null +++ b/src/layer/loongarch/bias_loongarch.h @@ -0,0 +1,30 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_BIAS_LOONGARCH_H +#define LAYER_BIAS_LOONGARCH_H + +#include "bias.h" + +namespace ncnn { + +class Bias_loongarch : virtual public Bias +{ +public: + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_BIAS_LOONGARCH_H diff --git a/src/layer/loongarch/binaryop_loongarch.cpp b/src/layer/loongarch/binaryop_loongarch.cpp new file mode 100644 index 000000000000..7832c9ca732b --- /dev/null +++ b/src/layer/loongarch/binaryop_loongarch.cpp @@ -0,0 +1,1066 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "binaryop_loongarch.h" + +#include + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +namespace ncnn { + +BinaryOp_loongarch::BinaryOp_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +template +static int binary_op_2_3_4_20(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = b.w; + int h = b.h; + int d = b.d; + int channels = b.c; + int elempack = b.elempack; + int size = w * h * d * elempack; + + // type 2 3 4 20 + c.create_like(b, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float a0 = a[0]; + const float* ptr = b.channel(q); + float* outptr = c.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _a0 = __lsx_vreplfr2vr_s(a0); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_a0, _p); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = op(a0, *ptr); + ptr += 1; + outptr += 1; + } + } + + return 0; +} + +template +static int binary_op_6_11_16_25(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + // type 6 11 16 25 + c.create_like(a, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float b0 = b[0]; + float* outptr = c.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _b0 = __lsx_vreplfr2vr_s(b0); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = 
(__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = op(*ptr, b0); + ptr += 1; + outptr += 1; + } + } + + return 0; +} + +template +static int binary_op_7_13_19_29(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + // type 7 13 19 29 + c.create_like(a, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = op(*ptr, *ptr1); + ptr += 1; + ptr1 += 1; + outptr += 1; + } + } + + return 0; +} + +#if __loongarch_sx +// broadcasting rule +// https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting + +template +static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int size = w * h * d; + size_t elemsize = a.elemsize; + int elempack = a.elempack; + + int w1 = b.w; + int h1 = b.h; + int d1 = b.d; + int channels1 = b.c; + int size1 = w1 * h1 * d1; + size_t elemsize1 = b.elemsize; + int elempack1 = b.elempack; + + if (a.dims == 4) + { + if (b.dims == 4) + { + // type 29 + return binary_op_7_13_19_29(a, b, c, opt); + } + + c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 3) + { + // type 28 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d; z++) + { + for (int y = 0; y < h; y++) + { + __m128 _b0 = (__m128)__lsx_vld(ptr1, 0); + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + + ptr1 += 4; + } + } + } + + return 0; + } + + if (b.dims == 2) + { + // type 27 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.row(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d; z++) + { + __m128 _b0 = (__m128)__lsx_vld(ptr1, 0); + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + } + + ptr1 += 4; + } + } + + return 0; + } + + if (b.dims == 1) + { + if (b.w == 1 && elempack1 == 1) + { + // type 25 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 26 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + __m128 _b0 = (__m128)__lsx_vld((const float*)b + q * 4, 0); + float* outptr = 
c.channel(q); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + } + + return 0; + } + } + else if (a.dims == 3) + { + if (b.dims == 4) + { + // type 23 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d1; z++) + { + for (int y = 0; y < h1; y++) + { + __m128 _a0 = (__m128)__lsx_vld(ptr, 0); + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + + ptr += 4; + } + } + } + + return 0; + } + + if (b.dims == 3) + { + if (w1 == 1 && h1 == 1 && channels1 == channels) + { + // special type 1 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* b0 = b.channel(q); + float* outptr = c.channel(q); + __m128 _b0 = (__m128)__lsx_vld(b0, 0); + for (int i = 0; i < size; i++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + } + + return 0; + } + + if (w1 == w && h1 == h && channels1 == 1 && elempack1 == 1) + { + // special type 2 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b; + float* outptr = c.channel(q); + for (int i = 0; i < size; i++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = __lsx_vreplfr2vr_s(ptr1[0]); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + ptr1 += 1; + outptr += 4; + } + } + + return 0; + } + + if (w == 1 && h == 1 && channels1 == channels) + { + // special type 3 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* a0 = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + __m128 _a0 = (__m128)__lsx_vld(a0, 0); + for (int i = 0; i < size1; i++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (w1 == w && h1 == h && channels == 1 && elempack == 1) + { + // special type 4 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a; + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + for (int i = 0; i < size1; i++) + { + __builtin_prefetch(ptr + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _p = __lsx_vreplfr2vr_s(ptr[0]); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_p, 
_p1); + __lsx_vst(_outp, outptr, 0); + ptr += 1; + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (w != 1 && w1 == 1 && h1 == h && channels1 == channels) + { + // special type 5 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + __m128 _p1 = (__m128)__lsx_vld(ptr1 + y * 4, 0); + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + + ptr += 4; + outptr += 4; + } + } + } + + return 0; + } + + if (w1 == w && h != 1 && h1 == 1 && channels1 == channels) + { + // special type 6 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1 + x * 4, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + + ptr += 4; + outptr += 4; + } + } + } + + return 0; + } + + if (w1 != 1 && w == 1 && h1 == h && channels1 == channels) + { + // special type 7 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + __m128 _p = (__m128)__lsx_vld(ptr + y * 4, 0); + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + + ptr1 += 4; + outptr += 4; + } + } + } + + return 0; + } + + if (w1 == w && h1 != 1 && h == 1 && channels1 == channels) + { + // special type 8 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p = (__m128)__lsx_vld(ptr + x * 4, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + + ptr1 += 4; + outptr += 4; + } + } + } + + return 0; + } + + // type 19 + return binary_op_7_13_19_29(a, b, c, opt); + } + + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 2) + { + // type 18 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.row(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + __m128 _b0 = (__m128)__lsx_vld(ptr1, 0); + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + 
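+                // store the pack-4 result and step to the next 4 floats of this row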
__lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + + ptr1 += 4; + } + } + + return 0; + } + + if (b.dims == 1) + { + if (b.w == 1 && elempack1 == 1) + { + // type 16 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 17 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + __m128 _b0 = (__m128)__lsx_vld((const float*)b + q * 4, 0); + float* outptr = c.channel(q); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + } + + return 0; + } + } + else if (a.dims == 2) + { + if (b.dims == 4) + { + // type 22 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.row(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d1; z++) + { + __m128 _a0 = (__m128)__lsx_vld(ptr, 0); + for (int y = 0; y < h1; y++) + { + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + } + + ptr += 4; + } + } + + return 0; + } + + if (b.dims == 3) + { + // type 14 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.row(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + __m128 _a0 = (__m128)__lsx_vld(ptr, 0); + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + + ptr += 4; + } + } + + return 0; + } + + c.create(w, h, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 2) + { + // type 13 + return binary_op_7_13_19_29(a, b, c, opt); + } + + if (b.dims == 1) + { + c.create(w, h, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.w == 1 && elempack1 == 1) + { + // type 11 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 12 + const float* ptr = a; + const float* ptr1 = b; + float* outptr = c; + + for (int y = 0; y < h; y++) + { + __m128 _b0 = (__m128)__lsx_vld(ptr1, 0); + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + + ptr1 += 4; + } + + return 0; + } + } + else if (a.dims == 1) + { + if (a.w == 1 && elempack == 1) + { + // type 2 3 4 20 + return binary_op_2_3_4_20(a, b, c, opt); + } + + if (b.dims == 4) + { + // type 21 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + __m128 _a0 = (__m128)__lsx_vld((const float*)a + q * 4, 0); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int i = 0; i < size1; i++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + 
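+            // _a0 holds channel q's pack-4 value of the 1-D input a; store the result
+            // and walk through the whole channel of b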
__lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (b.dims == 3) + { + // type 9 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + __m128 _a0 = (__m128)__lsx_vld((const float*)a + q * 4, 0); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int i = 0; i < size1; i++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (b.dims == 2) + { + // type 8 + c.create(w1, h1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + const float* ptr = a; + const float* ptr1 = b; + float* outptr = c; + + for (int y = 0; y < h1; y++) + { + __m128 _a0 = (__m128)__lsx_vld(ptr, 0); + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + + ptr += 4; + } + + return 0; + } + + if (b.dims == 1) + { + c.create(w, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.w == 1 && elempack1 == 1) + { + // type 6 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 7 + binary_op_7_13_19_29(a, b, c, opt); + } + } + + return 0; +} +#endif // __loongarch_sx + +template +static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = a.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _b = __lsx_vreplfr2vr_s(b); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = op(_p, _b); + __lsx_vst(_p, ptr, 0); + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = op(*ptr, b); + ptr++; + } + } + + return 0; +} + +namespace BinaryOp_loongarch_functor { + +#if __loongarch_sx +#define MAKE_FUNCTION(NAME, IMPL, IMPL4) \ + struct NAME \ + { \ + float operator()(const float& x, const float& y) const \ + { \ + return IMPL; \ + } \ + __m128 operator()(const __m128& x, const __m128& y) const \ + { \ + return IMPL4; \ + } \ + }; +#else +#define MAKE_FUNCTION(NAME, IMPL, IMPL4) \ + struct NAME \ + { \ + float operator()(const float& x, const float& y) const \ + { \ + return IMPL; \ + } \ + }; +#endif // __loongarch_sx + +// clang-format off +// *INDENT-OFF* +MAKE_FUNCTION(binary_op_add, x + y, __lsx_vfadd_s(x, y)) +MAKE_FUNCTION(binary_op_sub, x - y, __lsx_vfsub_s(x, y)) +MAKE_FUNCTION(binary_op_mul, x * y, __lsx_vfmul_s(x, y)) +MAKE_FUNCTION(binary_op_div, x / y, __lsx_vfdiv_s(x, y)) +MAKE_FUNCTION(binary_op_max, std::max(x, y), __lsx_vfmax_s(x, y)) +MAKE_FUNCTION(binary_op_min, std::min(x, y), __lsx_vfmin_s(x, y)) +MAKE_FUNCTION(binary_op_pow, (float)pow(x, y), pow_ps(x, y)) +MAKE_FUNCTION(binary_op_rsub, y - x, __lsx_vfsub_s(y, x)) +MAKE_FUNCTION(binary_op_rdiv, y / x, __lsx_vfdiv_s(y, x)) +// *INDENT-ON* +// clang-format on + +#undef MAKE_FUNCTION + +} // namespace BinaryOp_loongarch_functor + +int BinaryOp_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ 
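+    // when either input is packed (elempack 4), dispatch the templated pack-4 kernel
+    // for the requested op; every other layout falls through to BinaryOp::forward()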
+#if __loongarch_sx + using namespace BinaryOp_loongarch_functor; + + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& bottom_blob1 = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + int elempack = bottom_blob.elempack; + int elempack1 = bottom_blob1.elempack; + + if (elempack == 4 || elempack1 == 4) + { + if (op_type == Operation_ADD) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_SUB) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MUL) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_DIV) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MAX) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MIN) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_POW) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_RSUB) + return binary_op_pack4(bottom_blob1, bottom_blob, top_blob, opt); + + if (op_type == Operation_RDIV) + return binary_op_pack4(bottom_blob1, bottom_blob, top_blob, opt); + } +#endif // __loongarch_sx + + return BinaryOp::forward(bottom_blobs, top_blobs, opt); +} + +int BinaryOp_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + using namespace BinaryOp_loongarch_functor; + + if (op_type == Operation_ADD) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_SUB) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_MUL) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_DIV) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_MAX) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_MIN) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_POW) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_RSUB) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_RDIV) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/binaryop_loongarch.h b/src/layer/loongarch/binaryop_loongarch.h new file mode 100644 index 000000000000..bcf9ef5442fc --- /dev/null +++ b/src/layer/loongarch/binaryop_loongarch.h @@ -0,0 +1,34 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
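+// BinaryOp_loongarch overrides forward() for the two-input broadcasting path and
+// forward_inplace() for the scalar-operand path; both use the LSX pack-4 kernels
+// defined in binaryop_loongarch.cpp when __loongarch_sx is available.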
+ +#ifndef LAYER_BINARYOP_LOONGARCH_H +#define LAYER_BINARYOP_LOONGARCH_H + +#include "binaryop.h" + +namespace ncnn { + +class BinaryOp_loongarch : virtual public BinaryOp +{ +public: + BinaryOp_loongarch(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_BINARYOP_LOONGARCH_H diff --git a/src/layer/loongarch/cast_loongarch.cpp b/src/layer/loongarch/cast_loongarch.cpp new file mode 100644 index 000000000000..2e956657f142 --- /dev/null +++ b/src/layer/loongarch/cast_loongarch.cpp @@ -0,0 +1,209 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "cast_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +namespace ncnn { + +Cast_loongarch::Cast_loongarch() +{ + support_packing = true; +} + +int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + if (type_from == type_to) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + size_t out_elemsize = elemsize; + if (type_to == 1) + { + if (type_from == 3) + { + Cast::forward(bottom_blob, top_blob, opt); + } + + // float32 + out_elemsize = 4 * elempack; + } + else if (type_to == 2) + { + // float16 + out_elemsize = 2 * elempack; + } + else if (type_to == 3) + { + // int8 + out_elemsize = elempack; + } + else if (type_to == 4) + { + // bfloat16 + out_elemsize = 2 * elempack; + } + + if (dims == 1) + { + top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 2) + { + top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 3) + { + top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 4) + { + top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); + } + if (top_blob.empty()) + return -100; + + int size = w * h * d * elempack; + + if (type_from == 1 && type_to == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + unsigned short* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(ptr + 16); + __m128 _p0 = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr + 4, 0); + __m128i _p = __lsx_vfcvt_h_s(_p1, _p0); + __lsx_vst(_p, outptr, 0); + + ptr += 8; + outptr += 8; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = float32_to_float16(*ptr); + outptr++; + ptr++; + } + } + } + + if (type_from == 2 && type_to == 1) + { + #pragma 
omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const unsigned short* ptr = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(ptr + 16); + __m128i _p = __lsx_vld(ptr, 0); + __m128 _p0 = __lsx_vfcvtl_s_h(_p); + __m128 _p1 = __lsx_vfcvth_s_h(_p); + __lsx_vst(_p0, outptr, 0); + __lsx_vst(_p1, outptr + 4, 0); + + ptr += 8; + outptr += 8; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = float16_to_float32(*ptr); + outptr++; + ptr++; + } + } + } + + if (type_from == 3 && type_to == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const signed char* ptr = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + outptr[i] = (float)ptr[i]; + } + } + } + + if (type_from == 4 && type_to == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const unsigned short* ptr = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; + for (; i < size; i++) + { + *outptr = bfloat16_to_float32(*ptr); + outptr++; + ptr++; + } + } + } + + if (type_from == 1 && type_to == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + unsigned short* outptr = top_blob.channel(q); + + int i = 0; + for (; i < size; i++) + { + *outptr = float32_to_bfloat16(*ptr); + outptr++; + ptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/cast_loongarch.h b/src/layer/loongarch/cast_loongarch.h new file mode 100644 index 000000000000..1fe75c687d8e --- /dev/null +++ b/src/layer/loongarch/cast_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CAST_LOONGARCH_H +#define LAYER_CAST_LOONGARCH_H + +#include "cast.h" + +namespace ncnn { + +class Cast_loongarch : virtual public Cast +{ +public: + Cast_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_CAST_LOONGARCH_H diff --git a/src/layer/loongarch/clip_loongarch.cpp b/src/layer/loongarch/clip_loongarch.cpp new file mode 100644 index 000000000000..7cf0246d060c --- /dev/null +++ b/src/layer/loongarch/clip_loongarch.cpp @@ -0,0 +1,76 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "clip_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Clip_loongarch::Clip_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Clip_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _max = (__m128)__lsx_vreplfr2vr_s(max); + __m128 _min = (__m128)__lsx_vreplfr2vr_s(min); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmax_s(_p, _min); + _p = __lsx_vfmin_s(_p, _max); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < min) + *ptr = min; + + if (*ptr > max) + *ptr = max; + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/clip_loongarch.h b/src/layer/loongarch/clip_loongarch.h new file mode 100644 index 000000000000..43df62035ff3 --- /dev/null +++ b/src/layer/loongarch/clip_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CLIP_LOONGARCH_H +#define LAYER_CLIP_LOONGARCH_H + +#include "clip.h" + +namespace ncnn { + +class Clip_loongarch : virtual public Clip +{ +public: + Clip_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_CLIP_LOONGARCH_H diff --git a/src/layer/loongarch/concat_loongarch.cpp b/src/layer/loongarch/concat_loongarch.cpp new file mode 100644 index 000000000000..50460f8c134b --- /dev/null +++ b/src/layer/loongarch/concat_loongarch.cpp @@ -0,0 +1,348 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "concat_loongarch.h" + +namespace ncnn { + +Concat_loongarch::Concat_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Concat_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + int dims = bottom_blobs[0].dims; + int positive_axis = axis < 0 ? dims + axis : axis; + + if (dims == 1) // positive_axis == 0 + { + // concat vector + // total length + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w * bottom_blob.elempack; + } + + int out_elempack = opt.use_packing_layout && top_w % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + float* outptr = top_blob; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + const float* ptr = bottom_blob; + memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize); + + outptr += bottom_blob.w * bottom_blob.elempack; + } + } + + if (dims == 2 && positive_axis == 0) + { + // concat image + int w = bottom_blobs[0].w; + + // total height + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_h = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_h += bottom_blob.h * bottom_blob.elempack; + } + + int out_elempack = opt.use_packing_layout && top_h % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + Mat top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + float* outptr = top_blob_unpacked; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + if (bottom_blob.elempack == 4 && elempack == 1) + { + for (int i = 0; i < bottom_blob.h; i++) + { + const float* r0 = bottom_blob.row(i); + + float* outptr0 = outptr; + float* outptr1 = outptr + w; + float* outptr2 = outptr + w * 2; + float* outptr3 = outptr + w * 3; + + for (int j = 0; j < w; j++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + + outptr += w * 4; + } + } + else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4) + { + int size = w * bottom_blob.h; + + const float* ptr = bottom_blob; + memcpy(outptr, ptr, size * bottom_blob.elemsize); + + outptr += size * bottom_blob.elempack; + } + } + + // packing + if (elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + } + + if (dims == 2 && positive_axis == 1) + { + // interleave image row + int h = bottom_blobs[0].h; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total width + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w; + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* outptr = top_blob.row(i); + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + const float* ptr = bottom_blob.row(i); + memcpy(outptr, ptr, bottom_blob.w * elemsize); + + outptr += bottom_blob.w * elempack; + } + } + } + + if (dims == 3 && positive_axis == 0) + { + // concat dim + int w = bottom_blobs[0].w; + int h = bottom_blobs[0].h; + + // total channels + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_channels = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_channels += bottom_blob.c * bottom_blob.elempack; + } + + int out_elempack = opt.use_packing_layout && top_channels % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + Mat top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + int p = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + if (bottom_blob.elempack == 4 && elempack == 1) + { + int size = bottom_blob.w * bottom_blob.h; + + for (int q = 0; q < bottom_blob.c; q++) + { + const float* r0 = bottom_blob.channel(q); + + float* outptr0 = top_blob_unpacked.channel(p); + float* outptr1 = top_blob_unpacked.channel(p + 1); + float* outptr2 = top_blob_unpacked.channel(p + 2); + float* outptr3 = top_blob_unpacked.channel(p + 3); + + for (int i = 0; i < size; i++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + + p += 4; + } + } + else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4) + { + int size = bottom_blob.total(); + + const float* ptr = bottom_blob; + float* outptr = top_blob_unpacked.channel(p); + memcpy(outptr, ptr, size * bottom_blob.elemsize); + + p += bottom_blob.c; + } + } + + // packing + if (elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + } + + if (dims == 3 && positive_axis == 1) + { + // interleave dim height + int w = bottom_blobs[0].w; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_h = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + top_h += bottom_blob.h; + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + int size = bottom_blob.w * bottom_blob.h; + + const float* ptr = bottom_blob.channel(q); + memcpy(outptr, ptr, size * elemsize); + + outptr += size * elempack; + } + } + } + + if (dims == 3 && positive_axis == 2) + { + // interleave dim width + int h = bottom_blobs[0].h; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w; + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + + for (int i = 0; i < h; i++) + { + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + const float* ptr = bottom_blob.channel(q).row(i); + memcpy(outptr, ptr, bottom_blob.w * elemsize); + + outptr += bottom_blob.w * elempack; + } + } + } + } + + return 0; +} + +} // 
namespace ncnn diff --git a/src/layer/loongarch/concat_loongarch.h b/src/layer/loongarch/concat_loongarch.h new file mode 100644 index 000000000000..934c85244df3 --- /dev/null +++ b/src/layer/loongarch/concat_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CONCAT_LOONGARCH_H +#define LAYER_CONCAT_LOONGARCH_H + +#include "concat.h" + +namespace ncnn { + +class Concat_loongarch : virtual public Concat +{ +public: + Concat_loongarch(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_CONCAT_LOONGARCH_H diff --git a/src/layer/loongarch/convolution1d_loongarch.cpp b/src/layer/loongarch/convolution1d_loongarch.cpp new file mode 100644 index 000000000000..0b1a11c868f0 --- /dev/null +++ b/src/layer/loongarch/convolution1d_loongarch.cpp @@ -0,0 +1,379 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution1d_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +Convolution1D_loongarch::Convolution1D_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Convolution1D_loongarch::create_pipeline(const Option& opt) +{ + if (dynamic_weight) + return 0; + + const int num_input = weight_data_size / kernel_w / num_output; + + int elempack = 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + + // src = kw-inch-outch + // dst = pb-pa-kw-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(kernel_w, num_input, num_output); + + weight_data_packed.create(kernel_w, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + float* g00 = weight_data_packed.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < kernel_w; k++) + { + for (int i = 0; i < elempack; i++) + { + for (int j = 0; j < out_elempack; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + } + + return 0; +} + +int Convolution1D_loongarch::destroy_pipeline(const Option& /*opt*/) +{ + return 0; +} + +int Convolution1D_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + const int outw = (w - kernel_extent_w) / stride_w + 1; + const int outh = num_output / out_elempack; + + top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 4 && out_elempack == 4) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + float* outptr = top_blob.row(p); + + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + const float* kptr = weight_data_packed.channel(p); + + for (int q = 0; q < h; q++) + { + const float* sptr = bottom_blob_bordered.row(q) + j * stride_w * 4; + + for (int k = 0; k < kernel_w; k++) + { + __m128 _val0 = __lsx_vreplfr2vr_s(sptr[0]); + __m128 _val1 = __lsx_vreplfr2vr_s(sptr[1]); + __m128 _val2 = __lsx_vreplfr2vr_s(sptr[2]); + __m128 _val3 = __lsx_vreplfr2vr_s(sptr[3]); + + __m128 _w0 = (__m128)__lsx_vld(kptr, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0); + + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + _sum = __lsx_vfmadd_s(_w1, _val1, _sum); + _sum = __lsx_vfmadd_s(_w2, _val2, _sum); + _sum = __lsx_vfmadd_s(_w3, _val3, _sum); + + sptr += dilation_w * 4; + kptr += 16; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr, 0); + outptr += 4; + } + } + } + } + + if (elempack == 1 && out_elempack == 4) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + float* outptr = top_blob.row(p); + + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + const float* kptr = weight_data_packed.channel(p); + + for (int q = 0; q < h; 
q++) + { + const float* sptr = bottom_blob_bordered.row(q) + j * stride_w; + + for (int k = 0; k < kernel_w; k++) + { + __m128 _val = __lsx_vreplfr2vr_s(sptr[0]); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + + sptr += dilation_w; + kptr += 4; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr, 0); + outptr += 4; + } + } + } + } + + if (elempack == 4 && out_elempack == 1) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + float* outptr = top_blob.row(p); + + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + const float* kptr = weight_data_packed.channel(p); + + for (int q = 0; q < h; q++) + { + const float* sptr = bottom_blob_bordered.row(q) + j * stride_w * 4; + + for (int k = 0; k < kernel_w; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + + sptr += dilation_w * 4; + kptr += 4; + } + } + + sum += __lsx_reduce_fadd_s(_sum); + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + } + } + } +#endif // __loongarch_sx + + if (elempack == 1 && out_elempack == 1) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + float* outptr = top_blob.row(p); + + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + const float* kptr = (const float*)weight_data + kernel_w * h * p; + + for (int q = 0; q < h; q++) + { + const float* sptr = bottom_blob_bordered.row(q) + j * stride_w; + + for (int k = 0; k < kernel_w; k++) + { + float val = sptr[0]; + float wt = kptr[0]; + sum += val * wt; + + sptr += dilation_w; + kptr += 1; + } + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + } + } + } + + return 0; +} + +int Convolution1D_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& _weight_data = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + const int _kernel_w = _weight_data.w; + const int _num_output = _weight_data.c * _weight_data.elempack; + + Mat weight_data_flattened; + flatten(_weight_data, weight_data_flattened, opt); + if (weight_data_flattened.empty()) + return -100; + + // weight_data_flattened as pack1 + weight_data_flattened.w *= weight_data_flattened.elempack; + weight_data_flattened.elemsize /= weight_data_flattened.elempack; + weight_data_flattened.elempack = 1; + + Mat bias_data_flattened; + if (bias_term) + { + const Mat& _bias_data = bottom_blobs[2]; + flatten(_bias_data, bias_data_flattened, opt); + if (bias_data_flattened.empty()) + return -100; + + // bias_data_flattened as pack1 + bias_data_flattened.w *= bias_data_flattened.elempack; + bias_data_flattened.elemsize /= bias_data_flattened.elempack; + bias_data_flattened.elempack = 1; + } + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D); + + ncnn::ParamDict pd; + pd.set(0, _num_output); + pd.set(1, _kernel_w); + pd.set(2, dilation_w); + pd.set(3, stride_w); + pd.set(4, pad_left); + pd.set(15, pad_right); + pd.set(18, pad_value); + pd.set(5, bias_term); + pd.set(6, weight_data_flattened.w); + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + ncnn::Mat 
weights[2]; + weights[0] = weight_data_flattened; + weights[1] = bias_data_flattened; + + op->load_model(ncnn::ModelBinFromMatArray(weights)); + + op->create_pipeline(opt); + + op->forward(bottom_blob, top_blob, opt); + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/convolution1d_loongarch.h b/src/layer/loongarch/convolution1d_loongarch.h new file mode 100644 index 000000000000..36393df45688 --- /dev/null +++ b/src/layer/loongarch/convolution1d_loongarch.h @@ -0,0 +1,41 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CONVOLUTION1D_LOONGARCH_H +#define LAYER_CONVOLUTION1D_LOONGARCH_H + +#include "convolution1d.h" + +namespace ncnn { + +class Convolution1D_loongarch : virtual public Convolution1D +{ +public: + Convolution1D_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +public: + // packn + Mat weight_data_packed; +}; + +} // namespace ncnn + +#endif // LAYER_CONVOLUTION1D_LOONGARCH_H diff --git a/src/layer/loongarch/convolution_1x1.h b/src/layer/loongarch/convolution_1x1.h new file mode 100644 index 000000000000..83d3778411ae --- /dev/null +++ b/src/layer/loongarch/convolution_1x1.h @@ -0,0 +1,26 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv1x1s1_sgemm_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_int8.h b/src/layer/loongarch/convolution_1x1_int8.h new file mode 100644 index 000000000000..08f439c484ae --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_int8.h @@ -0,0 +1,83 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. 
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv1x1s1_sgemm_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const signed char* r0 = bottom_blob.channel(p); + signed char* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + outptr[2] = r0[4]; + outptr[3] = r0[6]; + + r0 += 8; + outptr += 4; + } + for (; j + 1 < outw; j += 2) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + + r0 += 4; + outptr += 2; + } + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack1to4_int8.h b/src/layer/loongarch/convolution_1x1_pack1to4_int8.h new file mode 100644 index 000000000000..00e1e2581417 --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack1to4_int8.h @@ -0,0 +1,83 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
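+
+// conv1x1s1_sgemm_pack1to4_int8_lsx views the whole w*h plane as a single
+// im2col row and forwards it to the packed int8 sgemm, since a 1x1 stride-1
+// convolution is just a GEMM.  conv1x1s2_sgemm_pack1to4_int8_lsx first gathers
+// every second pixel of every second row into a shrunken blob and then reuses
+// the stride-1 path; in rough scalar pseudo-code (illustrative indexing only):
+//
+//   for (int p = 0; p < channels; p++)
+//       for (int i = 0; i < outh; i++)
+//           for (int j = 0; j < outw; j++)
+//               shrinked(p, i, j) = input(p, i * 2, j * 2);
+//
+// The tailstep "w - 2 * outw + w" skips the tail of the current input row plus
+// the entire next row (stride_h == 2).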
+ +static void conv1x1s1_sgemm_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack1to4_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const signed char* r0 = bottom_blob.channel(p); + signed char* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + outptr[2] = r0[4]; + outptr[3] = r0[6]; + + r0 += 8; + outptr += 4; + } + for (; j + 1 < outw; j += 2) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + + r0 += 4; + outptr += 2; + } + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack1to4_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack4.h b/src/layer/loongarch/convolution_1x1_pack4.h new file mode 100644 index 000000000000..cf5a5b8e3638 --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack4.h @@ -0,0 +1,65 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
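+
+// A 1x1 stride-1 convolution over a pack4 fp32 blob is a plain GEMM, so
+// conv1x1s1_sgemm_pack4_lsx only reshapes the blob into one im2col row and
+// calls im2col_sgemm_pack4_lsx.  The stride-2 variant copies every second
+// pack4 pixel with one __lsx_vld/__lsx_vst pair into a shrunken blob and then
+// falls through to the stride-1 kernel; tailstep is scaled by 4 because each
+// pixel stores four packed floats.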
+ +static void conv1x1s1_sgemm_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack4_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} + +static void conv1x1s2_sgemm_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = (w - 2 * outw + w) * 4; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* r0 = bottom_blob.channel(p); + float* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _val = (__m128)__lsx_vld(r0, 0); + __lsx_vst(_val, outptr, 0); + + r0 += 4 * 2; + outptr += 4; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack4_lsx(bottom_blob_shrinked, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack4to1.h b/src/layer/loongarch/convolution_1x1_pack4to1.h new file mode 100644 index 000000000000..b87129091e4a --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack4to1.h @@ -0,0 +1,65 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
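+
+// pack4to1: the input stays in ncnn's elempack=4 layout (four consecutive
+// channels interleaved per pixel, so one vector load moves a whole pixel)
+// while the output is written back as plain elempack=1 channels by
+// im2col_sgemm_pack4to1_lsx.  The stride-1/stride-2 structure is the same as
+// in convolution_1x1_pack4.h: reshape to a single im2col row, or shrink the
+// input by a factor of two first and then reuse the stride-1 path.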
+ +static void conv1x1s1_sgemm_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack4to1_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} + +static void conv1x1s2_sgemm_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = (w - 2 * outw + w) * 4; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* r0 = bottom_blob.channel(p); + float* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _val = (__m128)__lsx_vld(r0, 0); + __lsx_vst(_val, outptr, 0); + + r0 += 4 * 2; + outptr += 4; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack4to1_lsx(bottom_blob_shrinked, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack8to1_int8.h b/src/layer/loongarch/convolution_1x1_pack8to1_int8.h new file mode 100644 index 000000000000..8df0e128b7fb --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack8to1_int8.h @@ -0,0 +1,65 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
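+
+// pack8to1 int8: eight int8 input channels are packed into one 64-bit element,
+// so the stride-2 shrink below copies a whole pixel per int64_t load/store.
+// As with the other 1x1 kernels, stride-1 is the packed int8 sgemm over a
+// single im2col row and stride-2 reduces to it after the shrink.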
+ +static void conv1x1s1_sgemm_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack8to1_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const int64_t* r0 = bottom_blob.channel(p); + int64_t* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack8to1_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack8to4_int8.h b/src/layer/loongarch/convolution_1x1_pack8to4_int8.h new file mode 100644 index 000000000000..6aaa720d23d0 --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack8to4_int8.h @@ -0,0 +1,65 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
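+
+// pack8to4 int8: same shrink-then-sgemm structure as the pack8to1 variant,
+// but the accumulated results are written back in pack4 layout by
+// im2col_sgemm_pack8to4_int8_lsx.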
+ +static void conv1x1s1_sgemm_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack8to4_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const int64_t* r0 = bottom_blob.channel(p); + int64_t* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack8to4_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_3x3.h b/src/layer/loongarch/convolution_3x3.h new file mode 100644 index 000000000000..66e10106b46c --- /dev/null +++ b/src/layer/loongarch/convolution_3x3.h @@ -0,0 +1,412 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
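+
+// This header implements Winograd convolution for the 3x3 stride-1 case:
+// winograd23 produces a 2x2 output tile from a 4x4 input tile (F(2,3)) and
+// winograd43 a 4x4 output tile from a 6x6 input tile (F(4,3)), following the
+// usual decomposition
+//
+//   Y = A^T [ (G g G^T) .* (B^T d B) ] A
+//
+// where g is the 3x3 kernel, d an input tile and .* an element-wise product.
+// The *_transform_kernel_lsx functions precompute U = G g G^T (ktm holds G)
+// and interleave it in groups of 8/4 output channels for the LSX dot kernel
+// (2 without LSX).  The conv3x3s1_winogradXX_lsx drivers pad the input on the
+// bottom/right so the output becomes a whole number of tiles, transform the
+// input tiles, run convolution_winograd_dot_lsx, transform the output tiles
+// back and finally crop the padding away.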
+ +static void conv3x3s1_winograd23_transform_kernel_lsx(const Mat& kernel, Mat& kernel_tm2, int inch, int outch, const Option& opt) +{ + Mat kernel_tm(4 * 4, inch, outch); + + // G + const float ktm[4][3] = { + {1.0f, 0.0f, 0.0f}, + {1.0f / 2, 1.0f / 2, 1.0f / 2}, + {1.0f / 2, -1.0f / 2, 1.0f / 2}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[4][3]; + for (int i = 0; i < 4; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 4; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 4; i++) + { + kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 16-inch-outch + // dst = inch-16-outch +#if __loongarch_sx + kernel_tm2.create(8 * inch, 16, outch / 8 + (outch % 8) / 4 + outch % 4); +#else + kernel_tm2.create(2 * inch, 16, outch / 2 + outch % 2); +#endif + + int q = 0; +#if __loongarch_sx + for (; q + 7 < outch; q += 8) + { + Mat g0 = kernel_tm2.channel(q / 8); + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 8; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } + for (; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm2.channel(q / 8 + (q % 8) / 4); + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 4; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#else // __loongarch_sx + for (; q + 1 < outch; q += 2) + { + Mat g0 = kernel_tm2.channel(q / 2); + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 2; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#endif // __loongarch_sx + for (; q < outch; q++) + { +#if __loongarch_sx + Mat g0 = kernel_tm2.channel(q / 8 + (q % 8) / 4 + q % 4); +#else + Mat g0 = kernel_tm2.channel(q / 2 + q % 2); +#endif + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + const float* k00 = kernel_tm.channel(q).row(p); + g00[0] = k00[k]; + g00++; + } + } + } +} + +static void conv3x3s1_winograd23_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 2n+2, winograd F(2,3) + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 1) / 2 * 2; + outh = (outh + 1) / 2 * 2; + + w = outw + 2; + h = outh + 2; + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw 
/ 2; + int h_tiles = outh / 2; + int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 16, inch, 4u, opt.workspace_allocator); + conv3x3s1_winograd23_transform_input_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator); + } + { + conv3x3s1_winograd23_transform_output_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} + +static void conv3x3s1_winograd43_transform_kernel_lsx(const Mat& kernel, Mat& kernel_tm2, int inch, int outch, const Option& opt) +{ + Mat kernel_tm(6 * 6, inch, outch); + + // G + const float ktm[6][3] = { + {1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = inch-36-outch +#if __loongarch_sx + kernel_tm2.create(8 * inch, 36, outch / 8 + (outch % 8) / 4 + outch % 4); +#else + kernel_tm2.create(2 * inch, 36, outch / 2 + outch % 2); +#endif + + int q = 0; +#if __loongarch_sx + for (; q + 7 < outch; q += 8) + { + Mat g0 = kernel_tm2.channel(q / 8); + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 8; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } + for (; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm2.channel(q / 8 + (q % 8) / 4); + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 4; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#else // __loongarch_sx + for (; q + 1 < outch; q += 2) + { + Mat g0 = kernel_tm2.channel(q / 2); + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 2; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#endif // __loongarch_sx + for (; q < outch; q++) + { +#if __loongarch_sx + Mat g0 = kernel_tm2.channel(q / 8 + 
(q % 8) / 4 + q % 4); +#else + Mat g0 = kernel_tm2.channel(q / 2 + q % 2); +#endif + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + const float* k00 = kernel_tm.channel(q).row(p); + g00[0] = k00[k]; + g00++; + } + } + } +} + +static void conv3x3s1_winograd43_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2, winograd F(4,3) + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, 4u, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_3x3_int8.h b/src/layer/loongarch/convolution_3x3_int8.h new file mode 100644 index 000000000000..3ea28dd09445 --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_int8.h @@ -0,0 +1,252 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
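+
+// Int8 flavour of the F(4,3) Winograd path in convolution_3x3.h.  The kernel
+// transform uses an integer-scaled G matrix (the short ktm table) so the
+// transformed weights fit in 16 bits, and the packed layout groups 4
+// input/output channels under LSX (2 output channels otherwise).  The driver
+// follows the same pad -> transform input -> winograd dot -> transform output
+// -> crop sequence, with 2-byte elements for the transformed input tiles and
+// 4-byte elements for the accumulated output.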
+ +static void conv3x3s1_winograd43_transform_kernel_int8_lsx(const Mat& kernel, Mat& kernel_tm_packed, int inch, int outch, const Option& opt) +{ + // winograd43 transform kernel + Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); + + const short ktm[6][3] = { + {6, 0, 0}, + {-4, -4, -4}, + {-4, 4, -4}, + {1, 2, 4}, + {1, -2, 4}, + {0, 0, 6} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; + short* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const signed char* k0 = kernel0; + const signed char* k1 = kernel0 + 3; + const signed char* k2 = kernel0 + 6; + + // h + short tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + short* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = 2b-inch-36-outch/2b +#if __loongarch_sx + if (outch >= 4) + { + if (inch >= 4) + kernel_tm_packed.create(inch / 4 + inch % 4, 36, outch / 4 + outch % 4, (size_t)2u * 16, 16); + else + kernel_tm_packed.create(inch, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4); + } +#else // __loongarch_sx + if (outch >= 2) + { + kernel_tm_packed.create(inch, 36, outch / 2 + outch % 2, (size_t)2u * 2, 2); + } +#endif // __loongarch_sx + else + { +#if __loongarch_sx + if (inch >= 4) + kernel_tm_packed.create(inch / 4 + inch % 4, 36, outch, (size_t)2u * 4, 4); + else +#endif // __loongarch_sx + { + kernel_tm_packed.create(inch, 36, outch, (size_t)2u, 1); + } + } + + int p = 0; +#if __loongarch_sx + for (; p + 3 < outch; p += 4) + { + const Mat k0 = kernel_tm.channel(p); + const Mat k1 = kernel_tm.channel(p + 1); + const Mat k2 = kernel_tm.channel(p + 2); + const Mat k3 = kernel_tm.channel(p + 3); + + Mat g0 = kernel_tm_packed.channel(p / 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + int q = 0; + for (; q + 3 < inch; q += 4) + { + g00[0] = k0.row(q)[k]; + g00[1] = k0.row(q + 1)[k]; + g00[2] = k0.row(q + 2)[k]; + g00[3] = k0.row(q + 3)[k]; + g00[4] = k1.row(q)[k]; + g00[5] = k1.row(q + 1)[k]; + g00[6] = k1.row(q + 2)[k]; + g00[7] = k1.row(q + 3)[k]; + g00[8] = k2.row(q)[k]; + g00[9] = k2.row(q + 1)[k]; + g00[10] = k2.row(q + 2)[k]; + g00[11] = k2.row(q + 3)[k]; + g00[12] = k3.row(q)[k]; + g00[13] = k3.row(q + 1)[k]; + g00[14] = k3.row(q + 2)[k]; + g00[15] = k3.row(q + 3)[k]; + g00 += 16; + } + for (; q < inch; q++) + { + g00[0] = k0.row(q)[k]; + g00[1] = k1.row(q)[k]; + g00[2] = k2.row(q)[k]; + g00[3] = k3.row(q)[k]; + g00 += 4; + } + } + } +#else // __loongarch_sx + for (; p + 1 < outch; p += 2) + { + const Mat k0 = kernel_tm.channel(p); + const Mat k1 = kernel_tm.channel(p + 1); + + Mat g0 = kernel_tm_packed.channel(p / 2); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + int q = 0; + for (; q < inch; q++) + { + g00[0] = k0.row(q)[k]; + g00[1] = k1.row(q)[k]; + g00 += 2; + } + } + } +#endif // __loongarch_sx + for (; p < outch; p++) + { + const Mat k0 = kernel_tm.channel(p); + +#if __loongarch_sx + Mat g0 = kernel_tm_packed.channel(p / 4 + p % 4); +#else + Mat g0 = 
kernel_tm_packed.channel(p / 2 + p % 2); +#endif + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + int q = 0; +#if __loongarch_sx + for (; q + 3 < inch; q += 4) + { + g00[0] = k0.row(q)[k]; + g00[1] = k0.row(q + 1)[k]; + g00[2] = k0.row(q + 2)[k]; + g00[3] = k0.row(q + 3)[k]; + g00 += 4; + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + g00[0] = k0.row(q)[k]; + g00 += 1; + } + } + } +} + +static void conv3x3s1_winograd43_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + // size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_int8_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_int8_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_int8_lsx(top_blob_tm, top_blob_bordered, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_3x3_pack1to4.h b/src/layer/loongarch/convolution_3x3_pack1to4.h new file mode 100644 index 000000000000..2bcb0ce166dd --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_pack1to4.h @@ -0,0 +1,812 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int inch = bottom_blob.c; + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = bias ? 
(__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + out0.fill(_bias0); + + const float* k0 = kernel.channel(p); + + int q = 0; + for (; q < inch; q++) + { + float* outptr0 = out0; + + const Mat img0 = bottom_blob.channel(q); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0); + __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j + 7 < outw; j += 8) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + __m128 _sum4 = (__m128)__lsx_vld(outptr0 + 4 * 4, 0); + __m128 _sum5 = (__m128)__lsx_vld(outptr0 + 4 * 5, 0); + __m128 _sum6 = (__m128)__lsx_vld(outptr0 + 4 * 6, 0); + __m128 _sum7 = (__m128)__lsx_vld(outptr0 + 4 * 7, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + __m128i _r0nn = __lsx_vld(r0 + 8, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2); + __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3); + __m128 _r08 = (__m128)__lsx_vreplvei_w(_r0nn, 0); + __m128 _r09 = (__m128)__lsx_vreplvei_w(_r0nn, 1); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r01, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r02, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r03, _sum3); + _sum4 = __lsx_vfmadd_s(_k00, _r04, _sum4); + _sum5 = __lsx_vfmadd_s(_k00, _r05, _sum5); + _sum6 = __lsx_vfmadd_s(_k00, _r06, _sum6); + _sum7 = __lsx_vfmadd_s(_k00, _r07, _sum7); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r03, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r04, _sum3); + _sum4 = __lsx_vfmadd_s(_k01, _r05, _sum4); + _sum5 = __lsx_vfmadd_s(_k01, _r06, _sum5); + _sum6 = __lsx_vfmadd_s(_k01, _r07, _sum6); + _sum7 = __lsx_vfmadd_s(_k01, _r08, _sum7); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r05, _sum3); + _sum4 = __lsx_vfmadd_s(_k02, _r06, _sum4); + _sum5 = __lsx_vfmadd_s(_k02, _r07, _sum5); + _sum6 = __lsx_vfmadd_s(_k02, _r08, _sum6); + _sum7 = __lsx_vfmadd_s(_k02, _r09, _sum7); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + __m128i _r1nn = __lsx_vld(r1 + 8, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2); + __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3); + __m128 _r18 = 
(__m128)__lsx_vreplvei_w(_r1nn, 0); + __m128 _r19 = (__m128)__lsx_vreplvei_w(_r1nn, 1); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r11, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r12, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r13, _sum3); + _sum4 = __lsx_vfmadd_s(_k10, _r14, _sum4); + _sum5 = __lsx_vfmadd_s(_k10, _r15, _sum5); + _sum6 = __lsx_vfmadd_s(_k10, _r16, _sum6); + _sum7 = __lsx_vfmadd_s(_k10, _r17, _sum7); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r13, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r14, _sum3); + _sum4 = __lsx_vfmadd_s(_k11, _r15, _sum4); + _sum5 = __lsx_vfmadd_s(_k11, _r16, _sum5); + _sum6 = __lsx_vfmadd_s(_k11, _r17, _sum6); + _sum7 = __lsx_vfmadd_s(_k11, _r18, _sum7); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r15, _sum3); + _sum4 = __lsx_vfmadd_s(_k12, _r16, _sum4); + _sum5 = __lsx_vfmadd_s(_k12, _r17, _sum5); + _sum6 = __lsx_vfmadd_s(_k12, _r18, _sum6); + _sum7 = __lsx_vfmadd_s(_k12, _r19, _sum7); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + __m128i _r2nn = __lsx_vld(r2 + 8, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2); + __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3); + __m128 _r28 = (__m128)__lsx_vreplvei_w(_r2nn, 0); + __m128 _r29 = (__m128)__lsx_vreplvei_w(_r2nn, 1); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r21, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r22, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r23, _sum3); + _sum4 = __lsx_vfmadd_s(_k20, _r24, _sum4); + _sum5 = __lsx_vfmadd_s(_k20, _r25, _sum5); + _sum6 = __lsx_vfmadd_s(_k20, _r26, _sum6); + _sum7 = __lsx_vfmadd_s(_k20, _r27, _sum7); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r23, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r24, _sum3); + _sum4 = __lsx_vfmadd_s(_k21, _r25, _sum4); + _sum5 = __lsx_vfmadd_s(_k21, _r26, _sum5); + _sum6 = __lsx_vfmadd_s(_k21, _r27, _sum6); + _sum7 = __lsx_vfmadd_s(_k21, _r28, _sum7); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r25, _sum3); + _sum4 = __lsx_vfmadd_s(_k22, _r26, _sum4); + _sum5 = __lsx_vfmadd_s(_k22, _r27, _sum5); + _sum6 = __lsx_vfmadd_s(_k22, _r28, _sum6); + _sum7 = __lsx_vfmadd_s(_k22, _r29, _sum7); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + __lsx_vst(_sum4, outptr0 + 4 * 4, 0); + __lsx_vst(_sum5, outptr0 + 4 * 5, 0); + __lsx_vst(_sum6, outptr0 + 4 * 6, 0); + __lsx_vst(_sum7, outptr0 + 4 * 7, 0); + + outptr0 += 4 * 8; + + r0 += 8; + r1 += 8; + r2 += 8; + } + for (; j + 3 < outw; j += 4) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = 
__lsx_vld(r0 + 4, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r01, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r02, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r03, _sum3); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r03, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r04, _sum3); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r05, _sum3); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r11, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r12, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r13, _sum3); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r13, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r14, _sum3); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r15, _sum3); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r21, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r22, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r23, _sum3); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r23, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r24, _sum3); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r25, _sum3); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + + outptr0 += 4 * 4; + + r0 += 4; + r1 += 4; + r2 += 4; + } + for (; j + 1 < outw; j += 2) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r01, _sum1); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r02, _sum1); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, 
_r03, _sum1); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r11, _sum1); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r12, _sum1); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r13, _sum1); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r21, _sum1); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r22, _sum1); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r23, _sum1); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + + outptr0 += 4 * 2; + + r0 += 2; + r1 += 2; + r2 += 2; + } + for (; j < outw; j++) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 1; + r1 += 1; + r2 += 1; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + + k0 += 9 * 4; + } + } +} + +static void conv3x3s2_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int tailstep = w - 2 * outw + w; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = bias ? 
(__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + out0.fill(_bias0); + + const float* k0 = kernel.channel(p); + + int q = 0; + for (; q < inch; q++) + { + float* outptr0 = out0; + + const Mat img0 = bottom_blob.channel(q); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0); + __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j + 7 < outw; j += 8) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + __m128 _sum4 = (__m128)__lsx_vld(outptr0 + 4 * 4, 0); + __m128 _sum5 = (__m128)__lsx_vld(outptr0 + 4 * 5, 0); + __m128 _sum6 = (__m128)__lsx_vld(outptr0 + 4 * 6, 0); + __m128 _sum7 = (__m128)__lsx_vld(outptr0 + 4 * 7, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + __m128i _r0nn = __lsx_vld(r0 + 8, 0); + __m128i _r0nnn = __lsx_vld(r0 + 12, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2); + __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3); + __m128 _r08 = (__m128)__lsx_vreplvei_w(_r0nn, 0); + __m128 _r09 = (__m128)__lsx_vreplvei_w(_r0nn, 1); + __m128 _r0a = (__m128)__lsx_vreplvei_w(_r0nn, 2); + __m128 _r0b = (__m128)__lsx_vreplvei_w(_r0nn, 3); + __m128 _r0c = (__m128)__lsx_vreplvei_w(_r0nnn, 0); + __m128 _r0d = (__m128)__lsx_vreplvei_w(_r0nnn, 1); + __m128 _r0e = (__m128)__lsx_vreplvei_w(_r0nnn, 2); + __m128 _r0f = (__m128)__lsx_vreplvei_w(_r0nnn, 3); + __m128 _r0g = __lsx_vreplfr2vr_s(r0[16]); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r06, _sum3); + _sum4 = __lsx_vfmadd_s(_k00, _r08, _sum4); + _sum5 = __lsx_vfmadd_s(_k00, _r0a, _sum5); + _sum6 = __lsx_vfmadd_s(_k00, _r0c, _sum6); + _sum7 = __lsx_vfmadd_s(_k00, _r0e, _sum7); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r05, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r07, _sum3); + _sum4 = __lsx_vfmadd_s(_k01, _r09, _sum4); + _sum5 = __lsx_vfmadd_s(_k01, _r0b, _sum5); + _sum6 = __lsx_vfmadd_s(_k01, _r0d, _sum6); + _sum7 = __lsx_vfmadd_s(_k01, _r0f, _sum7); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r06, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r08, _sum3); + _sum4 = __lsx_vfmadd_s(_k02, _r0a, _sum4); + _sum5 = __lsx_vfmadd_s(_k02, _r0c, _sum5); + _sum6 = __lsx_vfmadd_s(_k02, _r0e, _sum6); + _sum7 = __lsx_vfmadd_s(_k02, _r0g, _sum7); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + __m128i _r1nn = __lsx_vld(r1 + 8, 0); + __m128i _r1nnn = 
__lsx_vld(r1 + 12, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2); + __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3); + __m128 _r18 = (__m128)__lsx_vreplvei_w(_r1nn, 0); + __m128 _r19 = (__m128)__lsx_vreplvei_w(_r1nn, 1); + __m128 _r1a = (__m128)__lsx_vreplvei_w(_r1nn, 2); + __m128 _r1b = (__m128)__lsx_vreplvei_w(_r1nn, 3); + __m128 _r1c = (__m128)__lsx_vreplvei_w(_r1nnn, 0); + __m128 _r1d = (__m128)__lsx_vreplvei_w(_r1nnn, 1); + __m128 _r1e = (__m128)__lsx_vreplvei_w(_r1nnn, 2); + __m128 _r1f = (__m128)__lsx_vreplvei_w(_r1nnn, 3); + __m128 _r1g = __lsx_vreplfr2vr_s(r1[16]); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r16, _sum3); + _sum4 = __lsx_vfmadd_s(_k10, _r18, _sum4); + _sum5 = __lsx_vfmadd_s(_k10, _r1a, _sum5); + _sum6 = __lsx_vfmadd_s(_k10, _r1c, _sum6); + _sum7 = __lsx_vfmadd_s(_k10, _r1e, _sum7); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r15, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r17, _sum3); + _sum4 = __lsx_vfmadd_s(_k11, _r19, _sum4); + _sum5 = __lsx_vfmadd_s(_k11, _r1b, _sum5); + _sum6 = __lsx_vfmadd_s(_k11, _r1d, _sum6); + _sum7 = __lsx_vfmadd_s(_k11, _r1f, _sum7); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r16, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r18, _sum3); + _sum4 = __lsx_vfmadd_s(_k12, _r1a, _sum4); + _sum5 = __lsx_vfmadd_s(_k12, _r1c, _sum5); + _sum6 = __lsx_vfmadd_s(_k12, _r1e, _sum6); + _sum7 = __lsx_vfmadd_s(_k12, _r1g, _sum7); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + __m128i _r2nn = __lsx_vld(r2 + 8, 0); + __m128i _r2nnn = __lsx_vld(r2 + 12, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2); + __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3); + __m128 _r28 = (__m128)__lsx_vreplvei_w(_r2nn, 0); + __m128 _r29 = (__m128)__lsx_vreplvei_w(_r2nn, 1); + __m128 _r2a = (__m128)__lsx_vreplvei_w(_r2nn, 2); + __m128 _r2b = (__m128)__lsx_vreplvei_w(_r2nn, 3); + __m128 _r2c = (__m128)__lsx_vreplvei_w(_r2nnn, 0); + __m128 _r2d = (__m128)__lsx_vreplvei_w(_r2nnn, 1); + __m128 _r2e = (__m128)__lsx_vreplvei_w(_r2nnn, 2); + __m128 _r2f = (__m128)__lsx_vreplvei_w(_r2nnn, 3); + __m128 _r2g = __lsx_vreplfr2vr_s(r2[16]); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r26, _sum3); + _sum4 = __lsx_vfmadd_s(_k20, _r28, _sum4); + _sum5 = __lsx_vfmadd_s(_k20, _r2a, _sum5); + _sum6 = __lsx_vfmadd_s(_k20, _r2c, _sum6); + _sum7 = __lsx_vfmadd_s(_k20, _r2e, _sum7); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r25, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r27, _sum3); + _sum4 = 
__lsx_vfmadd_s(_k21, _r29, _sum4); + _sum5 = __lsx_vfmadd_s(_k21, _r2b, _sum5); + _sum6 = __lsx_vfmadd_s(_k21, _r2d, _sum6); + _sum7 = __lsx_vfmadd_s(_k21, _r2f, _sum7); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r26, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r28, _sum3); + _sum4 = __lsx_vfmadd_s(_k22, _r2a, _sum4); + _sum5 = __lsx_vfmadd_s(_k22, _r2c, _sum5); + _sum6 = __lsx_vfmadd_s(_k22, _r2e, _sum6); + _sum7 = __lsx_vfmadd_s(_k22, _r2g, _sum7); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + __lsx_vst(_sum4, outptr0 + 4 * 4, 0); + __lsx_vst(_sum5, outptr0 + 4 * 5, 0); + __lsx_vst(_sum6, outptr0 + 4 * 6, 0); + __lsx_vst(_sum7, outptr0 + 4 * 7, 0); + + outptr0 += 4 * 8; + + r0 += 16; + r1 += 16; + r2 += 16; + } + for (; j + 3 < outw; j += 4) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2); + __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3); + __m128 _r08 = __lsx_vreplfr2vr_s(r0[8]); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r06, _sum3); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r05, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r07, _sum3); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r06, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r08, _sum3); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2); + __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3); + __m128 _r18 = __lsx_vreplfr2vr_s(r1[8]); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r16, _sum3); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r15, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r17, _sum3); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r16, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r18, _sum3); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = 
(__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2); + __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3); + __m128 _r28 = __lsx_vreplfr2vr_s(r2[8]); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r26, _sum3); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r25, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r27, _sum3); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r26, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r28, _sum3); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + + outptr0 += 4 * 4; + + r0 += 8; + r1 += 8; + r2 += 8; + } + for (; j + 1 < outw; j += 2) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = __lsx_vreplfr2vr_s(r0[4]); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = __lsx_vreplfr2vr_s(r1[4]); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = __lsx_vreplfr2vr_s(r2[4]); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + + outptr0 += 4 * 2; + + r0 += 4; + r1 += 4; + r2 += 4; + } + for (; j < outw; j++) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + + _sum0 = 
__lsx_vfmadd_s(_k10, _r10, _sum0); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 2; + r1 += 2; + r2 += 2; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + + k0 += 9 * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_3x3_pack4.h b/src/layer/loongarch/convolution_3x3_pack4.h new file mode 100644 index 000000000000..f06bb7e9068c --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_pack4.h @@ -0,0 +1,425 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_winograd63_transform_kernel_pack4_lsx(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt) +{ + // winograd63 transform kernel + Mat kernel_tm; + kernel_tm.create(8 * 8, inch, outch); + + const float ktm[8][3] = { + {1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel, transposed + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[8][3]; + for (int i = 0; i < 8; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // v + for (int j = 0; j < 8; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) + { + kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 64-inch-outch + // dst = pb-pa-inch/pa-64-outch/pb + kernel_tm_pack4.create(inch / 4, 64, outch / 4, (size_t)4u * 4 * 4, 4 * 4); + + for (int q = 0; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm_pack4.channel(q / 4); + + for (int k = 0; k < 64; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = kernel_tm.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } +} + +static void 
conv3x3s1_winograd63_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 6n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 5) / 6 * 6; + outh = (outh + 5) / 6 * 6; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 6; + int h_tiles = outh / 6; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 64, inch, elemsize, elempack, opt.workspace_allocator); + conv3x3s1_winograd63_transform_input_pack4_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack4_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator); + } + { + conv3x3s1_winograd63_transform_output_pack4_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} + +static void conv3x3s1_winograd43_transform_kernel_pack4_lsx(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt) +{ + // winograd43 transform kernel + Mat kernel_tm(6 * 6, inch, outch); + + const float ktm[6][3] = { + {1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = pb-pa-inch/pa-36-outch/pb + kernel_tm_pack4.create(inch / 4, 36, outch / 4, (size_t)4u * 4 * 4, 4 * 4); + + for (int q = 0; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm_pack4.channel(q / 4); + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = kernel_tm.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } 
+} + +static void conv3x3s1_winograd43_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_pack4_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack4_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_pack4_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} + +static void conv3x3s1_winograd23_transform_kernel_pack4_lsx(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt) +{ + // winograd23 transform kernel + Mat kernel_tm(4 * 4, inch, outch); + + const float ktm[4][3] = { + {1.0f, 0.0f, 0.0f}, + {1.0f / 2, 1.0f / 2, 1.0f / 2}, + {1.0f / 2, -1.0f / 2, 1.0f / 2}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[4][3]; + for (int i = 0; i < 4; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 4; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 4; i++) + { + kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 16-inch-outch + // dst = pb-pa-inch/pa-16-outch/pb + kernel_tm_pack4.create(inch / 4, 16, outch / 4, (size_t)4u * 4 * 4, 4 * 4); + + for (int q = 0; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm_pack4.channel(q / 4); + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = kernel_tm.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } +} + +static void conv3x3s1_winograd23_pack4_lsx(const Mat& 
bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 2n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 1) / 2 * 2; + outh = (outh + 1) / 2 * 2; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 2; + int h_tiles = outh / 2; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 16, inch, elemsize, elempack, opt.workspace_allocator); + conv3x3s1_winograd23_transform_input_pack4_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack4_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator); + } + { + conv3x3s1_winograd23_transform_output_pack4_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_3x3_pack8to1_int8.h b/src/layer/loongarch/convolution_3x3_pack8to1_int8.h new file mode 100644 index 000000000000..3c4f97187533 --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_pack8to1_int8.h @@ -0,0 +1,177 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +static void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_lsx(const Mat& kernel, Mat& kernel_tm_pack8to1, int inch, int outch, const Option& opt) +{ + // winograd43 transform kernel + Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); + + const short ktm[6][3] = { + {6, 0, 0}, + {-4, -4, -4}, + {-4, 4, -4}, + {1, 2, 4}, + {1, -2, 4}, + {0, 0, 6} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; + short* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const signed char* k0 = kernel0; + const signed char* k1 = kernel0 + 3; + const signed char* k2 = kernel0 + 6; + + // h + short tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + short* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = 4b-8a-inch/8a-36-outch/4b + kernel_tm_pack8to1.create(8 * inch / 8, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4); + + int p = 0; + for (; p + 3 < outch; p += 4) + { + const Mat k0 = kernel_tm.channel(p); + const Mat k1 = kernel_tm.channel(p + 1); + const Mat k2 = kernel_tm.channel(p + 2); + const Mat k3 = kernel_tm.channel(p + 3); + + Mat g0 = kernel_tm_pack8to1.channel(p / 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + for (int q = 0; q + 7 < inch; q += 8) + { + for (int i = 0; i < 8; i++) + { + g00[0] = k0.row(q + i)[k]; + g00[1] = k1.row(q + i)[k]; + g00[2] = k2.row(q + i)[k]; + g00[3] = k3.row(q + i)[k]; + + g00 += 4; + } + } + } + } + for (; p < outch; p++) + { + const Mat k0 = kernel_tm.channel(p); + + Mat g0 = kernel_tm_pack8to1.channel(p / 4 + p % 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + for (int q = 0; q + 7 < inch; q += 8) + { + for (int i = 0; i < 8; i++) + { + g00[0] = k0.row(q + i)[k]; + + g00 += 1; + } + } + } + } +} + +static void conv3x3s1_winograd43_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + // size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_pack8_int8_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack8to1_int8_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat 
top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_int8_lsx(top_blob_tm, top_blob_bordered, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_3x3_pack8to4_int8.h b/src/layer/loongarch/convolution_3x3_pack8to4_int8.h new file mode 100644 index 000000000000..bf328cee73f9 --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_pack8to4_int8.h @@ -0,0 +1,161 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_lsx(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt) +{ + // winograd43 transform kernel + Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); + + const short ktm[6][3] = { + {6, 0, 0}, + {-4, -4, -4}, + {-4, 4, -4}, + {1, 2, 4}, + {1, -2, 4}, + {0, 0, 6} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; + short* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const signed char* k0 = kernel0; + const signed char* k1 = kernel0 + 3; + const signed char* k2 = kernel0 + 6; + + // h + short tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + short* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = 4b-8a-inch/8a-36-outch/4b + kernel_tm_pack8.create(inch / 8, 36, outch / 4, (size_t)2u * 32, 32); + + int q = 0; + for (; q + 3 < outch; q += 4) + { + const Mat k0 = kernel_tm.channel(q); + const Mat k1 = kernel_tm.channel(q + 1); + const Mat k2 = kernel_tm.channel(q + 2); + const Mat k3 = kernel_tm.channel(q + 3); + + Mat kernel_tm = kernel_tm_pack8.channel(q / 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = kernel_tm.row(k); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int i = 0; i < 8; i++) + { + const short* k00 = k0.row(p + i); + const short* k10 = k1.row(p + i); + const short* k20 = k2.row(p + i); + const short* k30 = k3.row(p + i); + + g00[0] = k00[k]; + g00[1] = k10[k]; + g00[2] = k20[k]; + g00[3] = k30[k]; + + g00 += 4; + } + } + } + } +} + +static void 
conv3x3s1_winograd43_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + // size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_pack8_int8_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack8to4_int8_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u * 4, 4, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_pack4_int8_lsx(top_blob_tm, top_blob_bordered, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_7x7_pack1to4.h b/src/layer/loongarch/convolution_7x7_pack1to4.h new file mode 100644 index 000000000000..f57923b53d00 --- /dev/null +++ b/src/layer/loongarch/convolution_7x7_pack1to4.h @@ -0,0 +1,652 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv7x7s2_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int tailstep = w - 2 * outw + w; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = bias ? 
(__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + out0.fill(_bias0); + + for (int q = 0; q < inch; q++) + { + float* outptr0 = out0; + + const Mat img0 = bottom_blob.channel(q); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + const float* r3 = img0.row(3); + const float* r4 = img0.row(4); + const float* r5 = img0.row(5); + const float* r6 = img0.row(6); + + const float* kptr = kernel.channel(p).row(q); + + int i = 0; + + for (; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + + __m128 _k00 = (__m128)__lsx_vld(kptr, 0); + __m128 _k01 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k05 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k06 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + __m128i _r0nn = __lsx_vld(r0 + 8, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2); + __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3); + __m128 _r08 = (__m128)__lsx_vreplvei_w(_r0nn, 0); + __m128 _r09 = (__m128)__lsx_vreplvei_w(_r0nn, 1); + __m128 _r0a = (__m128)__lsx_vreplvei_w(_r0nn, 2); + __m128 _r0b = (__m128)__lsx_vreplvei_w(_r0nn, 3); + __m128 _r0c = __lsx_vreplfr2vr_s(r0[12]); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r06, _sum3); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r05, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r07, _sum3); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r06, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r08, _sum3); + _sum0 = __lsx_vfmadd_s(_k03, _r03, _sum0); + _sum1 = __lsx_vfmadd_s(_k03, _r05, _sum1); + _sum2 = __lsx_vfmadd_s(_k03, _r07, _sum2); + _sum3 = __lsx_vfmadd_s(_k03, _r09, _sum3); + _sum0 = __lsx_vfmadd_s(_k04, _r04, _sum0); + _sum1 = __lsx_vfmadd_s(_k04, _r06, _sum1); + _sum2 = __lsx_vfmadd_s(_k04, _r08, _sum2); + _sum3 = __lsx_vfmadd_s(_k04, _r0a, _sum3); + _sum0 = __lsx_vfmadd_s(_k05, _r05, _sum0); + _sum1 = __lsx_vfmadd_s(_k05, _r07, _sum1); + _sum2 = __lsx_vfmadd_s(_k05, _r09, _sum2); + _sum3 = __lsx_vfmadd_s(_k05, _r0b, _sum3); + _sum0 = __lsx_vfmadd_s(_k06, _r06, _sum0); + _sum1 = __lsx_vfmadd_s(_k06, _r08, _sum1); + _sum2 = __lsx_vfmadd_s(_k06, _r0a, _sum2); + _sum3 = __lsx_vfmadd_s(_k06, _r0c, _sum3); + + __m128 _k10 = (__m128)__lsx_vld(kptr, 0); + __m128 _k11 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k14 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k15 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k16 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + 
kptr += 4 * 7; + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + __m128i _r1nn = __lsx_vld(r1 + 8, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2); + __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3); + __m128 _r18 = (__m128)__lsx_vreplvei_w(_r1nn, 0); + __m128 _r19 = (__m128)__lsx_vreplvei_w(_r1nn, 1); + __m128 _r1a = (__m128)__lsx_vreplvei_w(_r1nn, 2); + __m128 _r1b = (__m128)__lsx_vreplvei_w(_r1nn, 3); + __m128 _r1c = __lsx_vreplfr2vr_s(r1[12]); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r16, _sum3); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r15, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r17, _sum3); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r16, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r18, _sum3); + _sum0 = __lsx_vfmadd_s(_k13, _r13, _sum0); + _sum1 = __lsx_vfmadd_s(_k13, _r15, _sum1); + _sum2 = __lsx_vfmadd_s(_k13, _r17, _sum2); + _sum3 = __lsx_vfmadd_s(_k13, _r19, _sum3); + _sum0 = __lsx_vfmadd_s(_k14, _r14, _sum0); + _sum1 = __lsx_vfmadd_s(_k14, _r16, _sum1); + _sum2 = __lsx_vfmadd_s(_k14, _r18, _sum2); + _sum3 = __lsx_vfmadd_s(_k14, _r1a, _sum3); + _sum0 = __lsx_vfmadd_s(_k15, _r15, _sum0); + _sum1 = __lsx_vfmadd_s(_k15, _r17, _sum1); + _sum2 = __lsx_vfmadd_s(_k15, _r19, _sum2); + _sum3 = __lsx_vfmadd_s(_k15, _r1b, _sum3); + _sum0 = __lsx_vfmadd_s(_k16, _r16, _sum0); + _sum1 = __lsx_vfmadd_s(_k16, _r18, _sum1); + _sum2 = __lsx_vfmadd_s(_k16, _r1a, _sum2); + _sum3 = __lsx_vfmadd_s(_k16, _r1c, _sum3); + + __m128 _k20 = (__m128)__lsx_vld(kptr, 0); + __m128 _k21 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k25 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k26 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + __m128i _r2nn = __lsx_vld(r2 + 8, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2); + __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3); + __m128 _r28 = (__m128)__lsx_vreplvei_w(_r2nn, 0); + __m128 _r29 = (__m128)__lsx_vreplvei_w(_r2nn, 1); + __m128 _r2a = (__m128)__lsx_vreplvei_w(_r2nn, 2); + __m128 _r2b = (__m128)__lsx_vreplvei_w(_r2nn, 3); + __m128 _r2c = __lsx_vreplfr2vr_s(r2[12]); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r26, _sum3); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r25, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r27, 
_sum3); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r26, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r28, _sum3); + _sum0 = __lsx_vfmadd_s(_k23, _r23, _sum0); + _sum1 = __lsx_vfmadd_s(_k23, _r25, _sum1); + _sum2 = __lsx_vfmadd_s(_k23, _r27, _sum2); + _sum3 = __lsx_vfmadd_s(_k23, _r29, _sum3); + _sum0 = __lsx_vfmadd_s(_k24, _r24, _sum0); + _sum1 = __lsx_vfmadd_s(_k24, _r26, _sum1); + _sum2 = __lsx_vfmadd_s(_k24, _r28, _sum2); + _sum3 = __lsx_vfmadd_s(_k24, _r2a, _sum3); + _sum0 = __lsx_vfmadd_s(_k25, _r25, _sum0); + _sum1 = __lsx_vfmadd_s(_k25, _r27, _sum1); + _sum2 = __lsx_vfmadd_s(_k25, _r29, _sum2); + _sum3 = __lsx_vfmadd_s(_k25, _r2b, _sum3); + _sum0 = __lsx_vfmadd_s(_k26, _r26, _sum0); + _sum1 = __lsx_vfmadd_s(_k26, _r28, _sum1); + _sum2 = __lsx_vfmadd_s(_k26, _r2a, _sum2); + _sum3 = __lsx_vfmadd_s(_k26, _r2c, _sum3); + + __m128 _k30 = (__m128)__lsx_vld(kptr, 0); + __m128 _k31 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k33 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k35 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k36 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r3 = __lsx_vld(r3, 0); + __m128i _r3n = __lsx_vld(r3 + 4, 0); + __m128i _r3nn = __lsx_vld(r3 + 8, 0); + + __m128 _r30 = (__m128)__lsx_vreplvei_w(_r3, 0); + __m128 _r31 = (__m128)__lsx_vreplvei_w(_r3, 1); + __m128 _r32 = (__m128)__lsx_vreplvei_w(_r3, 2); + __m128 _r33 = (__m128)__lsx_vreplvei_w(_r3, 3); + __m128 _r34 = (__m128)__lsx_vreplvei_w(_r3n, 0); + __m128 _r35 = (__m128)__lsx_vreplvei_w(_r3n, 1); + __m128 _r36 = (__m128)__lsx_vreplvei_w(_r3n, 2); + __m128 _r37 = (__m128)__lsx_vreplvei_w(_r3n, 3); + __m128 _r38 = (__m128)__lsx_vreplvei_w(_r3nn, 0); + __m128 _r39 = (__m128)__lsx_vreplvei_w(_r3nn, 1); + __m128 _r3a = (__m128)__lsx_vreplvei_w(_r3nn, 2); + __m128 _r3b = (__m128)__lsx_vreplvei_w(_r3nn, 3); + __m128 _r3c = __lsx_vreplfr2vr_s(r3[12]); + + _sum0 = __lsx_vfmadd_s(_k30, _r30, _sum0); + _sum1 = __lsx_vfmadd_s(_k30, _r32, _sum1); + _sum2 = __lsx_vfmadd_s(_k30, _r34, _sum2); + _sum3 = __lsx_vfmadd_s(_k30, _r36, _sum3); + _sum0 = __lsx_vfmadd_s(_k31, _r31, _sum0); + _sum1 = __lsx_vfmadd_s(_k31, _r33, _sum1); + _sum2 = __lsx_vfmadd_s(_k31, _r35, _sum2); + _sum3 = __lsx_vfmadd_s(_k31, _r37, _sum3); + _sum0 = __lsx_vfmadd_s(_k32, _r32, _sum0); + _sum1 = __lsx_vfmadd_s(_k32, _r34, _sum1); + _sum2 = __lsx_vfmadd_s(_k32, _r36, _sum2); + _sum3 = __lsx_vfmadd_s(_k32, _r38, _sum3); + _sum0 = __lsx_vfmadd_s(_k33, _r33, _sum0); + _sum1 = __lsx_vfmadd_s(_k33, _r35, _sum1); + _sum2 = __lsx_vfmadd_s(_k33, _r37, _sum2); + _sum3 = __lsx_vfmadd_s(_k33, _r39, _sum3); + _sum0 = __lsx_vfmadd_s(_k34, _r34, _sum0); + _sum1 = __lsx_vfmadd_s(_k34, _r36, _sum1); + _sum2 = __lsx_vfmadd_s(_k34, _r38, _sum2); + _sum3 = __lsx_vfmadd_s(_k34, _r3a, _sum3); + _sum0 = __lsx_vfmadd_s(_k35, _r35, _sum0); + _sum1 = __lsx_vfmadd_s(_k35, _r37, _sum1); + _sum2 = __lsx_vfmadd_s(_k35, _r39, _sum2); + _sum3 = __lsx_vfmadd_s(_k35, _r3b, _sum3); + _sum0 = __lsx_vfmadd_s(_k36, _r36, _sum0); + _sum1 = __lsx_vfmadd_s(_k36, _r38, _sum1); + _sum2 = __lsx_vfmadd_s(_k36, _r3a, _sum2); + _sum3 = __lsx_vfmadd_s(_k36, _r3c, _sum3); + + __m128 _k40 = (__m128)__lsx_vld(kptr, 0); + __m128 _k41 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(kptr + 
4 * 4, 0); + __m128 _k45 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k46 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r4 = __lsx_vld(r4, 0); + __m128i _r4n = __lsx_vld(r4 + 4, 0); + __m128i _r4nn = __lsx_vld(r4 + 8, 0); + + __m128 _r40 = (__m128)__lsx_vreplvei_w(_r4, 0); + __m128 _r41 = (__m128)__lsx_vreplvei_w(_r4, 1); + __m128 _r42 = (__m128)__lsx_vreplvei_w(_r4, 2); + __m128 _r43 = (__m128)__lsx_vreplvei_w(_r4, 3); + __m128 _r44 = (__m128)__lsx_vreplvei_w(_r4n, 0); + __m128 _r45 = (__m128)__lsx_vreplvei_w(_r4n, 1); + __m128 _r46 = (__m128)__lsx_vreplvei_w(_r4n, 2); + __m128 _r47 = (__m128)__lsx_vreplvei_w(_r4n, 3); + __m128 _r48 = (__m128)__lsx_vreplvei_w(_r4nn, 0); + __m128 _r49 = (__m128)__lsx_vreplvei_w(_r4nn, 1); + __m128 _r4a = (__m128)__lsx_vreplvei_w(_r4nn, 2); + __m128 _r4b = (__m128)__lsx_vreplvei_w(_r4nn, 3); + __m128 _r4c = __lsx_vreplfr2vr_s(r4[12]); + + _sum0 = __lsx_vfmadd_s(_k40, _r40, _sum0); + _sum1 = __lsx_vfmadd_s(_k40, _r42, _sum1); + _sum2 = __lsx_vfmadd_s(_k40, _r44, _sum2); + _sum3 = __lsx_vfmadd_s(_k40, _r46, _sum3); + _sum0 = __lsx_vfmadd_s(_k41, _r41, _sum0); + _sum1 = __lsx_vfmadd_s(_k41, _r43, _sum1); + _sum2 = __lsx_vfmadd_s(_k41, _r45, _sum2); + _sum3 = __lsx_vfmadd_s(_k41, _r47, _sum3); + _sum0 = __lsx_vfmadd_s(_k42, _r42, _sum0); + _sum1 = __lsx_vfmadd_s(_k42, _r44, _sum1); + _sum2 = __lsx_vfmadd_s(_k42, _r46, _sum2); + _sum3 = __lsx_vfmadd_s(_k42, _r48, _sum3); + _sum0 = __lsx_vfmadd_s(_k43, _r43, _sum0); + _sum1 = __lsx_vfmadd_s(_k43, _r45, _sum1); + _sum2 = __lsx_vfmadd_s(_k43, _r47, _sum2); + _sum3 = __lsx_vfmadd_s(_k43, _r49, _sum3); + _sum0 = __lsx_vfmadd_s(_k44, _r44, _sum0); + _sum1 = __lsx_vfmadd_s(_k44, _r46, _sum1); + _sum2 = __lsx_vfmadd_s(_k44, _r48, _sum2); + _sum3 = __lsx_vfmadd_s(_k44, _r4a, _sum3); + _sum0 = __lsx_vfmadd_s(_k45, _r45, _sum0); + _sum1 = __lsx_vfmadd_s(_k45, _r47, _sum1); + _sum2 = __lsx_vfmadd_s(_k45, _r49, _sum2); + _sum3 = __lsx_vfmadd_s(_k45, _r4b, _sum3); + _sum0 = __lsx_vfmadd_s(_k46, _r46, _sum0); + _sum1 = __lsx_vfmadd_s(_k46, _r48, _sum1); + _sum2 = __lsx_vfmadd_s(_k46, _r4a, _sum2); + _sum3 = __lsx_vfmadd_s(_k46, _r4c, _sum3); + + __m128 _k50 = (__m128)__lsx_vld(kptr, 0); + __m128 _k51 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k52 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k53 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k54 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k55 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k56 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r5 = __lsx_vld(r5, 0); + __m128i _r5n = __lsx_vld(r5 + 4, 0); + __m128i _r5nn = __lsx_vld(r5 + 8, 0); + + __m128 _r50 = (__m128)__lsx_vreplvei_w(_r5, 0); + __m128 _r51 = (__m128)__lsx_vreplvei_w(_r5, 1); + __m128 _r52 = (__m128)__lsx_vreplvei_w(_r5, 2); + __m128 _r53 = (__m128)__lsx_vreplvei_w(_r5, 3); + __m128 _r54 = (__m128)__lsx_vreplvei_w(_r5n, 0); + __m128 _r55 = (__m128)__lsx_vreplvei_w(_r5n, 1); + __m128 _r56 = (__m128)__lsx_vreplvei_w(_r5n, 2); + __m128 _r57 = (__m128)__lsx_vreplvei_w(_r5n, 3); + __m128 _r58 = (__m128)__lsx_vreplvei_w(_r5nn, 0); + __m128 _r59 = (__m128)__lsx_vreplvei_w(_r5nn, 1); + __m128 _r5a = (__m128)__lsx_vreplvei_w(_r5nn, 2); + __m128 _r5b = (__m128)__lsx_vreplvei_w(_r5nn, 3); + __m128 _r5c = __lsx_vreplfr2vr_s(r5[12]); + + _sum0 = __lsx_vfmadd_s(_k50, _r50, _sum0); + _sum1 = __lsx_vfmadd_s(_k50, _r52, _sum1); + _sum2 = __lsx_vfmadd_s(_k50, _r54, _sum2); + _sum3 = __lsx_vfmadd_s(_k50, _r56, _sum3); + _sum0 = __lsx_vfmadd_s(_k51, _r51, _sum0); + _sum1 = 
__lsx_vfmadd_s(_k51, _r53, _sum1); + _sum2 = __lsx_vfmadd_s(_k51, _r55, _sum2); + _sum3 = __lsx_vfmadd_s(_k51, _r57, _sum3); + _sum0 = __lsx_vfmadd_s(_k52, _r52, _sum0); + _sum1 = __lsx_vfmadd_s(_k52, _r54, _sum1); + _sum2 = __lsx_vfmadd_s(_k52, _r56, _sum2); + _sum3 = __lsx_vfmadd_s(_k52, _r58, _sum3); + _sum0 = __lsx_vfmadd_s(_k53, _r53, _sum0); + _sum1 = __lsx_vfmadd_s(_k53, _r55, _sum1); + _sum2 = __lsx_vfmadd_s(_k53, _r57, _sum2); + _sum3 = __lsx_vfmadd_s(_k53, _r59, _sum3); + _sum0 = __lsx_vfmadd_s(_k54, _r54, _sum0); + _sum1 = __lsx_vfmadd_s(_k54, _r56, _sum1); + _sum2 = __lsx_vfmadd_s(_k54, _r58, _sum2); + _sum3 = __lsx_vfmadd_s(_k54, _r5a, _sum3); + _sum0 = __lsx_vfmadd_s(_k55, _r55, _sum0); + _sum1 = __lsx_vfmadd_s(_k55, _r57, _sum1); + _sum2 = __lsx_vfmadd_s(_k55, _r59, _sum2); + _sum3 = __lsx_vfmadd_s(_k55, _r5b, _sum3); + _sum0 = __lsx_vfmadd_s(_k56, _r56, _sum0); + _sum1 = __lsx_vfmadd_s(_k56, _r58, _sum1); + _sum2 = __lsx_vfmadd_s(_k56, _r5a, _sum2); + _sum3 = __lsx_vfmadd_s(_k56, _r5c, _sum3); + + __m128 _k60 = (__m128)__lsx_vld(kptr, 0); + __m128 _k61 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k62 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k63 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k64 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k65 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k66 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr -= 4 * 42; + + __m128i _r6 = __lsx_vld(r6, 0); + __m128i _r6n = __lsx_vld(r6 + 4, 0); + __m128i _r6nn = __lsx_vld(r6 + 8, 0); + + __m128 _r60 = (__m128)__lsx_vreplvei_w(_r6, 0); + __m128 _r61 = (__m128)__lsx_vreplvei_w(_r6, 1); + __m128 _r62 = (__m128)__lsx_vreplvei_w(_r6, 2); + __m128 _r63 = (__m128)__lsx_vreplvei_w(_r6, 3); + __m128 _r64 = (__m128)__lsx_vreplvei_w(_r6n, 0); + __m128 _r65 = (__m128)__lsx_vreplvei_w(_r6n, 1); + __m128 _r66 = (__m128)__lsx_vreplvei_w(_r6n, 2); + __m128 _r67 = (__m128)__lsx_vreplvei_w(_r6n, 3); + __m128 _r68 = (__m128)__lsx_vreplvei_w(_r6nn, 0); + __m128 _r69 = (__m128)__lsx_vreplvei_w(_r6nn, 1); + __m128 _r6a = (__m128)__lsx_vreplvei_w(_r6nn, 2); + __m128 _r6b = (__m128)__lsx_vreplvei_w(_r6nn, 3); + __m128 _r6c = __lsx_vreplfr2vr_s(r6[12]); + + _sum0 = __lsx_vfmadd_s(_k60, _r60, _sum0); + _sum1 = __lsx_vfmadd_s(_k60, _r62, _sum1); + _sum2 = __lsx_vfmadd_s(_k60, _r64, _sum2); + _sum3 = __lsx_vfmadd_s(_k60, _r66, _sum3); + _sum0 = __lsx_vfmadd_s(_k61, _r61, _sum0); + _sum1 = __lsx_vfmadd_s(_k61, _r63, _sum1); + _sum2 = __lsx_vfmadd_s(_k61, _r65, _sum2); + _sum3 = __lsx_vfmadd_s(_k61, _r67, _sum3); + _sum0 = __lsx_vfmadd_s(_k62, _r62, _sum0); + _sum1 = __lsx_vfmadd_s(_k62, _r64, _sum1); + _sum2 = __lsx_vfmadd_s(_k62, _r66, _sum2); + _sum3 = __lsx_vfmadd_s(_k62, _r68, _sum3); + _sum0 = __lsx_vfmadd_s(_k63, _r63, _sum0); + _sum1 = __lsx_vfmadd_s(_k63, _r65, _sum1); + _sum2 = __lsx_vfmadd_s(_k63, _r67, _sum2); + _sum3 = __lsx_vfmadd_s(_k63, _r69, _sum3); + _sum0 = __lsx_vfmadd_s(_k64, _r64, _sum0); + _sum1 = __lsx_vfmadd_s(_k64, _r66, _sum1); + _sum2 = __lsx_vfmadd_s(_k64, _r68, _sum2); + _sum3 = __lsx_vfmadd_s(_k64, _r6a, _sum3); + _sum0 = __lsx_vfmadd_s(_k65, _r65, _sum0); + _sum1 = __lsx_vfmadd_s(_k65, _r67, _sum1); + _sum2 = __lsx_vfmadd_s(_k65, _r69, _sum2); + _sum3 = __lsx_vfmadd_s(_k65, _r6b, _sum3); + _sum0 = __lsx_vfmadd_s(_k66, _r66, _sum0); + _sum1 = __lsx_vfmadd_s(_k66, _r68, _sum1); + _sum2 = __lsx_vfmadd_s(_k66, _r6a, _sum2); + _sum3 = __lsx_vfmadd_s(_k66, _r6c, _sum3); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + 
__lsx_vst(_sum3, outptr0 + 4 * 3, 0); + + outptr0 += 4 * 4; + + r0 += 8; + r1 += 8; + r2 += 8; + r3 += 8; + r4 += 8; + r5 += 8; + r6 += 8; + } + for (; j < outw; j++) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + + __m128 _k00 = (__m128)__lsx_vld(kptr, 0); + __m128 _k01 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k05 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k06 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k00, (__m128)__lsx_vreplvei_w(_r0, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k01, (__m128)__lsx_vreplvei_w(_r0, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k02, (__m128)__lsx_vreplvei_w(_r0, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k03, (__m128)__lsx_vreplvei_w(_r0, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k04, (__m128)__lsx_vreplvei_w(_r0n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k05, (__m128)__lsx_vreplvei_w(_r0n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k06, (__m128)__lsx_vreplvei_w(_r0n, 2), _sum0); + + __m128 _k10 = (__m128)__lsx_vld(kptr, 0); + __m128 _k11 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k14 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k15 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k16 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k10, (__m128)__lsx_vreplvei_w(_r1, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k11, (__m128)__lsx_vreplvei_w(_r1, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k12, (__m128)__lsx_vreplvei_w(_r1, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k13, (__m128)__lsx_vreplvei_w(_r1, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k14, (__m128)__lsx_vreplvei_w(_r1n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k15, (__m128)__lsx_vreplvei_w(_r1n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k16, (__m128)__lsx_vreplvei_w(_r1n, 2), _sum0); + + __m128 _k20 = (__m128)__lsx_vld(kptr, 0); + __m128 _k21 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k25 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k26 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k20, (__m128)__lsx_vreplvei_w(_r2, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k21, (__m128)__lsx_vreplvei_w(_r2, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k22, (__m128)__lsx_vreplvei_w(_r2, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k23, (__m128)__lsx_vreplvei_w(_r2, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k24, (__m128)__lsx_vreplvei_w(_r2n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k25, (__m128)__lsx_vreplvei_w(_r2n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k26, (__m128)__lsx_vreplvei_w(_r2n, 2), _sum0); + + __m128 _k30 = (__m128)__lsx_vld(kptr, 0); + __m128 _k31 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k33 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k35 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k36 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r3 = __lsx_vld(r3, 0); + __m128i _r3n = __lsx_vld(r3 + 4, 
0); + + _sum0 = __lsx_vfmadd_s(_k30, (__m128)__lsx_vreplvei_w(_r3, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k31, (__m128)__lsx_vreplvei_w(_r3, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k32, (__m128)__lsx_vreplvei_w(_r3, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k33, (__m128)__lsx_vreplvei_w(_r3, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k34, (__m128)__lsx_vreplvei_w(_r3n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k35, (__m128)__lsx_vreplvei_w(_r3n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k36, (__m128)__lsx_vreplvei_w(_r3n, 2), _sum0); + + __m128 _k40 = (__m128)__lsx_vld(kptr, 0); + __m128 _k41 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k45 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k46 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r4 = __lsx_vld(r4, 0); + __m128i _r4n = __lsx_vld(r4 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k40, (__m128)__lsx_vreplvei_w(_r4, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k41, (__m128)__lsx_vreplvei_w(_r4, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k42, (__m128)__lsx_vreplvei_w(_r4, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k43, (__m128)__lsx_vreplvei_w(_r4, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k44, (__m128)__lsx_vreplvei_w(_r4n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k45, (__m128)__lsx_vreplvei_w(_r4n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k46, (__m128)__lsx_vreplvei_w(_r4n, 2), _sum0); + + __m128 _k50 = (__m128)__lsx_vld(kptr, 0); + __m128 _k51 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k52 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k53 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k54 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k55 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k56 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r5 = __lsx_vld(r5, 0); + __m128i _r5n = __lsx_vld(r5 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k50, (__m128)__lsx_vreplvei_w(_r5, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k51, (__m128)__lsx_vreplvei_w(_r5, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k52, (__m128)__lsx_vreplvei_w(_r5, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k53, (__m128)__lsx_vreplvei_w(_r5, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k54, (__m128)__lsx_vreplvei_w(_r5n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k55, (__m128)__lsx_vreplvei_w(_r5n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k56, (__m128)__lsx_vreplvei_w(_r5n, 2), _sum0); + + __m128 _k60 = (__m128)__lsx_vld(kptr, 0); + __m128 _k61 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k62 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k63 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k64 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k65 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k66 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr -= 4 * 42; + + __m128i _r6 = __lsx_vld(r6, 0); + __m128i _r6n = __lsx_vld(r6 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k60, (__m128)__lsx_vreplvei_w(_r6, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k61, (__m128)__lsx_vreplvei_w(_r6, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k62, (__m128)__lsx_vreplvei_w(_r6, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k63, (__m128)__lsx_vreplvei_w(_r6, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k64, (__m128)__lsx_vreplvei_w(_r6n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k65, (__m128)__lsx_vreplvei_w(_r6n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k66, (__m128)__lsx_vreplvei_w(_r6n, 2), _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 2; + r1 += 2; + r2 += 2; + r3 += 2; + r4 += 2; + r5 += 2; + r6 += 2; + } + + r0 
+= tailstep; + r1 += tailstep; + r2 += tailstep; + r3 += tailstep; + r4 += tailstep; + r5 += tailstep; + r6 += tailstep; + } + } + } +} diff --git a/src/layer/loongarch/convolution_int8.h b/src/layer/loongarch/convolution_int8.h new file mode 100644 index 000000000000..22c7a8ccbe6b --- /dev/null +++ b/src/layer/loongarch/convolution_int8.h @@ -0,0 +1,82 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int sum = 0; + + // const signed char* kptr = weight_data_int8.channel(p); + const signed char* kptr = (const signed char*)weight_data_int8 + maxk * channels * p; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + signed char val = sptr[space_ofs[k]]; + signed char w = kptr[k]; + sum += val * w; + } + + kptr += maxk; + } + + outptr[j] = sum; + } + + outptr += outw; + } + } +} diff --git a/src/layer/loongarch/convolution_loongarch.cpp b/src/layer/loongarch/convolution_loongarch.cpp new file mode 100644 index 000000000000..31719b3de92b --- /dev/null +++ b/src/layer/loongarch/convolution_loongarch.cpp @@ -0,0 +1,975 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
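
The scalar fallback paths in this patch (convolution_int8 just above, the plain-float fallback later in convolution_loongarch.cpp, and the packed kernels further down) all rely on the same space_ofs table: per-tap offsets into the padded input are precomputed once so the inner loop can gather dilated samples with a flat index. Below is a minimal standalone sketch of that bookkeeping, not part of the patch; the sizes are made-up example values.

// Illustrative sketch: how the space_ofs table maps a flat tap index k to an
// offset inside the padded input, for arbitrary kernel size and dilation.
#include <cstdio>
#include <vector>

int main()
{
    const int w = 16;                        // padded input width (assumed)
    const int kernel_w = 3, kernel_h = 3;
    const int dilation_w = 2, dilation_h = 2;

    const int maxk = kernel_w * kernel_h;
    std::vector<int> space_ofs(maxk);
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;          // equals i * w * dilation_h + j * dilation_w
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    for (int k = 0; k < maxk; k++)
        printf("tap %d -> offset %d\n", k, space_ofs[k]);

    return 0;
}

The packed variants reuse exactly the same table and only scale the offset by the element pack size (space_ofs[k] * 4 or * 8).
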
+
+#include "convolution_loongarch.h"
+
+#include "benchmark.h"
+#include "cpu.h"
+#include "layer_type.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif // __loongarch_sx
+
+#include "loongarch_activation.h"
+#include "loongarch_usability.h"
+
+#include "cpu.h"
+
+namespace ncnn {
+
+#include "convolution_sgemm.h"
+#include "convolution_winograd_transform.h"
+#include "convolution_winograd_dot.h"
+#include "convolution_1x1.h"
+#include "convolution_3x3.h"
+
+#if NCNN_INT8
+#include "convolution_sgemm_int8.h"
+#include "convolution_winograd_transform_int8.h"
+#include "convolution_winograd_dot_int8.h"
+#include "convolution_1x1_int8.h"
+#include "convolution_3x3_int8.h"
+#include "convolution_int8.h"
+#endif // NCNN_INT8
+
+#if __loongarch_sx
+#include "convolution_pack4.h"
+#include "convolution_pack1to4.h"
+#include "convolution_pack4to1.h"
+
+#include "convolution_sgemm_pack4.h"
+#include "convolution_sgemm_pack4to1.h"
+#include "convolution_winograd_transform_pack4.h"
+#include "convolution_winograd_dot_pack4.h"
+#include "convolution_1x1_pack4.h"
+#include "convolution_1x1_pack4to1.h"
+#include "convolution_3x3_pack4.h"
+#include "convolution_3x3_pack1to4.h"
+#include "convolution_7x7_pack1to4.h"
+
+#if NCNN_INT8
+#include "convolution_pack8to4_int8.h"
+#include "convolution_pack1to4_int8.h"
+#include "convolution_pack8to1_int8.h"
+#include "convolution_sgemm_pack8to4_int8.h"
+#include "convolution_sgemm_pack1to4_int8.h"
+#include "convolution_sgemm_pack8to1_int8.h"
+#include "convolution_winograd_transform_pack4_int8.h"
+#include "convolution_winograd_transform_pack8_int8.h"
+#include "convolution_winograd_dot_pack8to4_int8.h"
+#include "convolution_winograd_dot_pack8to1_int8.h"
+#include "convolution_1x1_pack8to4_int8.h"
+#include "convolution_1x1_pack1to4_int8.h"
+#include "convolution_1x1_pack8to1_int8.h"
+#include "convolution_3x3_pack8to4_int8.h"
+#include "convolution_3x3_pack8to1_int8.h"
+#endif // NCNN_INT8
+#endif // __loongarch_sx
+
+Convolution_loongarch::Convolution_loongarch()
+{
+#if __loongarch_sx
+    support_packing = true;
+#endif // __loongarch_sx
+
+    activation = 0;
+}
+
+static void convolution_transform_kernel_packed_lsx(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
+{
+    const int maxk = kernel_w * kernel_h;
+
+    // src = kw-kh-inch-outch
+    // dst = pb-pa-kw-kh-inch/pa-outch/pb
+    {
+        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
+
+        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);
+
+        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
+        {
+            float* g00 = weight_data_tm.channel(q / out_elempack);
+
+            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
+            {
+                for (int k = 0; k < maxk; k++)
+                {
+                    for (int i = 0; i < elempack; i++)
+                    {
+                        for (int j = 0; j < out_elempack; j++)
+                        {
+                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);
+
+                            g00[0] = k00[k];
+
+                            g00++;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+int Convolution_loongarch::create_pipeline(const Option& opt)
+{
+    if (dynamic_weight)
+        return 0;
+
+    activation = create_activation_layer(activation_type, activation_params, opt);
+
+#if NCNN_INT8
+    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
+    {
+        return create_pipeline_int8_loongarch(opt);
+    }
+#endif
+
+    const int maxk = kernel_w * kernel_h;
+    const int num_input = weight_data_size / maxk / num_output;
+
+    int elempack
= 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif + +#if __loongarch_sx + // pack4 + if (elempack == 4 && out_elempack == 4) + { + if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd63_convolution && num_input >= 8 && num_output >= 8 && num_input <= 64 && num_output <= 64) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd63_transform_kernel_pack4_lsx(weight_data, weight_winograd63_data, num_input, num_output, opt); + else if ((opt.use_winograd43_convolution && num_input >= 8 && num_output >= 8) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd43_transform_kernel_pack4_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + else // if (opt.use_winograd23_convolution) + conv3x3s1_winograd23_transform_kernel_pack4_lsx(weight_data, weight_winograd23_data, num_input, num_output, opt); + } + else + { + convolution_transform_kernel_packed_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack1ton + if (elempack == 1 && out_elempack == 4) + { + convolution_transform_kernel_packed_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + + // pack4to1 + if (elempack == 4 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif // __loongarch_sx + + // pack1 + if (elempack == 1 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || !opt.use_winograd23_convolution) + { + conv3x3s1_winograd43_transform_kernel_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + } + else if (opt.use_winograd23_convolution) + { + conv3x3s1_winograd23_transform_kernel_lsx(weight_data, weight_winograd23_data, num_input, num_output, opt); + } + } + else if (opt.use_sgemm_convolution) + { + 
convolution_im2col_sgemm_transform_kernel_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + weight_data_tm = weight_data; + } + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int Convolution_loongarch::destroy_pipeline(const Option& opt) +{ + if (activation) + { + activation->destroy_pipeline(opt); + delete activation; + activation = 0; + } + + return 0; +} + +int Convolution_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + return forward_int8_loongarch(bottom_blob, top_blob, opt); + } +#endif + + // flattened blob, implement as InnerProduct + if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) + { + Mat bottom_blob_3d; + if (bottom_blob.elemsize % 16 == 0) + { + bottom_blob_3d = bottom_blob; + bottom_blob_3d.dims = 3; + bottom_blob_3d.w = 1; + bottom_blob_3d.h = 1; + bottom_blob_3d.c = bottom_blob.w; + bottom_blob_3d.cstep = 1; + } + else + { + bottom_blob_3d = bottom_blob.reshape(1, 1, bottom_blob.w, opt.workspace_allocator); + } + + Mat top_blob_3d; + int ret = forward(bottom_blob_3d, top_blob_3d, opt); + if (ret != 0) + return ret; + + if (top_blob_3d.elemsize % 16 == 0) + { + top_blob = top_blob_3d; + top_blob.dims = 1; + top_blob.w = top_blob_3d.c; + top_blob.h = 1; + top_blob.c = 1; + bottom_blob_3d.cstep = top_blob_3d.c; + } + else + { + top_blob = top_blob_3d.reshape(top_blob_3d.c, opt.blob_allocator); + } + + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + // NCNN_LOGE("Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int num_input = channels * elempack; + +#if __loongarch_sx + if (elempack == 4 && out_elempack == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd63_convolution && num_input >= 8 && num_output >= 8 && num_input <= 64 && num_output <= 64) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd63_pack4_lsx(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, opt); + else if ((opt.use_winograd43_convolution && num_input >= 8 && num_output >= 8) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd43_pack4_lsx(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt); + else // if (opt.use_winograd23_convolution) + conv3x3s1_winograd23_pack4_lsx(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + convolution_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 4) + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv3x3s2_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv7x7s2_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + convolution_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, 
opt); + } + } + + if (elempack == 4 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + convolution_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } +#endif // __loongarch_sx + + if (elempack == 1 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || !opt.use_winograd23_convolution) + { + conv3x3s1_winograd43_lsx(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt); + } + else if (opt.use_winograd23_convolution) + { + conv3x3s1_winograd23_lsx(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt); + } + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + const float* kptr = (const float*)weight_data_tm + maxk * channels * p; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + const float* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + float val = sptr[space_ofs[k]]; + float wt = kptr[k]; + sum += val * wt; + } + + kptr += maxk; + } + + sum = activation_ss(sum, 
activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } + } + } + + return 0; +} + +int Convolution_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& _weight_data = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + const int _kernel_w = _weight_data.w; + const int _kernel_h = _weight_data.h; + const int _num_output = _weight_data.c * _weight_data.elempack; + + Mat weight_data_flattened; + flatten(_weight_data, weight_data_flattened, opt); + if (weight_data_flattened.empty()) + return -100; + + // weight_data_flattened as pack1 + weight_data_flattened.w *= weight_data_flattened.elempack; + weight_data_flattened.elemsize /= weight_data_flattened.elempack; + weight_data_flattened.elempack = 1; + + Mat bias_data_flattened; + if (bias_term) + { + const Mat& _bias_data = bottom_blobs[2]; + flatten(_bias_data, bias_data_flattened, opt); + if (bias_data_flattened.empty()) + return -100; + + // bias_data_flattened as pack1 + bias_data_flattened.w *= bias_data_flattened.elempack; + bias_data_flattened.elemsize /= bias_data_flattened.elempack; + bias_data_flattened.elempack = 1; + } + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); + + ncnn::ParamDict pd; + pd.set(0, _num_output); + pd.set(1, _kernel_w); + pd.set(11, _kernel_h); + pd.set(2, dilation_w); + pd.set(21, dilation_h); + pd.set(3, stride_w); + pd.set(31, stride_h); + pd.set(4, pad_left); + pd.set(15, pad_right); + pd.set(14, pad_top); + pd.set(16, pad_bottom); + pd.set(18, pad_value); + pd.set(5, bias_term); + pd.set(6, weight_data_flattened.w); + pd.set(8, int8_scale_term); + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + ncnn::Mat weights[2]; + weights[0] = weight_data_flattened; + weights[1] = bias_data_flattened; + + op->load_model(ncnn::ModelBinFromMatArray(weights)); + + op->create_pipeline(opt); + + op->forward(bottom_blob, top_blob, opt); + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + +#if NCNN_INT8 +static void convolution_transform_kernel_packed_int8_lsx(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) +{ + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pa-pb-kw-kh-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + signed char* g00 = weight_data_tm.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < out_elempack; i++) + { + for (int j = 0; j < elempack; j++) + { + const signed char* k00 = weight_data_r2.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + } +} + +int Convolution_loongarch::create_pipeline_int8_loongarch(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + + int elempack = 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = num_input % 8 == 0 ? 8 : 1; + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif // __loongarch_sx + +#if __loongarch_sx + if (elempack == 8 && out_elempack == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_transform_kernel_pack8to4_int8_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_int8_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 1 && out_elempack == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_int8_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 8 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_transform_kernel_pack8to1_int8_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_int8_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif 
// __loongarch_sx + + if (elempack == 1 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_transform_kernel_int8_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + weight_data_tm = weight_data; + } + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // requantize and relu + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int Convolution_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + int w = bottom_blob_bordered.w; + int h = bottom_blob_bordered.h; + int channels = bottom_blob_bordered.c; + int elempack = bottom_blob_bordered.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + if (use_int8_requantize) + out_elempack = num_output % 8 == 0 ? 8 : 1; + else + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __loongarch_sx + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int num_input = channels * elempack; + + int out_elempack_int32 = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack_int32 = num_output % 4 == 0 ? 
4 : 1; + } +#endif // __loongarch_sx + + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 8 && out_elempack_int32 == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + } + + if (elempack == 1 && out_elempack_int32 == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + } + + if (elempack == 8 && out_elempack_int32 == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_pack8to1_int8_lsx(bottom_blob_bordered, 
top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + } +#endif // __loongarch_sx + + if (elempack == 1 && out_elempack_int32 == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + } + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + + return 0; +} +#endif // NCNN_INT8 + +} // namespace ncnn diff --git a/src/layer/loongarch/convolution_loongarch.h b/src/layer/loongarch/convolution_loongarch.h new file mode 100644 index 000000000000..a84281bf7135 --- /dev/null +++ b/src/layer/loongarch/convolution_loongarch.h @@ -0,0 +1,56 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
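
forward_int8_loongarch above finishes in one of two ways: when int8_scale_term > 100 the int32 sums are requantized straight back to int8 (with the activation fused into the requantize), otherwise they are dequantized to float using scale_in_data and the bias, and the activation layer runs afterwards. The sketch below models the dequantize branch in scalar form; the scale and bias values are made up for illustration and are not taken from the patch.

// Illustrative scalar sketch of the int8 epilogue: the int32 accumulator is scaled
// back to float with scale_in = 1 / (bottom_scale * weight_scale), then bias is added.
#include <cstdio>

int main()
{
    const int sum_int32 = 12345;              // accumulator produced by an int8 kernel
    const float bottom_scale = 127.f / 2.5f;  // assumed activation quantization scale
    const float weight_scale = 127.f / 0.8f;  // assumed per-output-channel weight scale
    const float bias = 0.01f;

    // mirrors the zero check in create_pipeline_int8_loongarch
    const float scale_in = (weight_scale == 0.f) ? 0.f : 1.f / (bottom_scale * weight_scale);
    const float dequantized = sum_int32 * scale_in + bias;

    printf("dequantized output = %f\n", dequantized);
    return 0;
}
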
+ +#ifndef LAYER_CONVOLUTION_LOONGARCH_H +#define LAYER_CONVOLUTION_LOONGARCH_H + +#include "convolution.h" + +namespace ncnn { + +class Convolution_loongarch : virtual public Convolution +{ +public: + Convolution_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +protected: +#if NCNN_INT8 + int create_pipeline_int8_loongarch(const Option& opt); + int forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + +public: + Layer* activation; + + Mat weight_data_tm; + Mat weight_sgemm_data; + Mat weight_winograd23_data; + Mat weight_winograd43_data; + Mat weight_winograd63_data; + +#if NCNN_INT8 + Mat scale_in_data; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_CONVOLUTION_LOONGARCH_H diff --git a/src/layer/loongarch/convolution_pack1to4.h b/src/layer/loongarch/convolution_pack1to4.h new file mode 100644 index 000000000000..b7e0123d5edd --- /dev/null +++ b/src/layer/loongarch/convolution_pack1to4.h @@ -0,0 +1,90 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
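
convolution_pack1to4_lsx below handles the pack1 input / pack4 output case: each scalar input sample is broadcast across the four packed output channels and multiply-accumulated against a group of four weights with __lsx_vfmadd_s. A scalar equivalent of that inner loop, with illustrative sizes and names that are not part of the patch, looks like this.

// Illustrative scalar equivalent of the pack1to4 accumulation: lane j of the vector
// accumulator is packed output channel j.
#include <cstdio>

int main()
{
    const int maxk = 9;           // e.g. 3x3 kernel
    const int channels = 2;       // input channels, elempack = 1
    float in[2][9] = {};          // gathered input samples, in[q][k]
    float w[2][9][4] = {};        // packed weights: w[q][k][j] feeds output channel j
    float sum[4] = {0.f, 0.f, 0.f, 0.f};

    in[0][0] = 1.f;
    w[0][0][0] = 0.5f; w[0][0][1] = -0.5f; w[0][0][2] = 2.f; w[0][0][3] = 0.f;

    for (int q = 0; q < channels; q++)
        for (int k = 0; k < maxk; k++)
            for (int j = 0; j < 4; j++)
                sum[j] += in[q][k] * w[q][k][j];   // broadcast in[q][k], FMA with 4 weights

    printf("%f %f %f %f\n", sum[0], sum[1], sum[2], sum[3]);
    return 0;
}
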
+ +static void convolution_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_data_ptr) + { + _sum = (__m128)__lsx_vld(bias_data_ptr + p * 4, 0); + } + + const float* kptr = (const float*)weight_data_pack1ton + maxk * channels * p * 4; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const float* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) // 29.23 + { + __m128 _val = __lsx_vreplfr2vr_s(sptr[space_ofs[k]]); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + + kptr += 4; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_pack1to4_int8.h b/src/layer/loongarch/convolution_pack1to4_int8.h new file mode 100644 index 000000000000..b043503c2ac6 --- /dev/null +++ b/src/layer/loongarch/convolution_pack1to4_int8.h @@ -0,0 +1,87 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
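
The int8 variant below cannot use a float FMA, so it widens on the fly: __lsx_vslti_b produces a 0 / -1 mask for negative bytes, and interleaving that mask with the original bytes via __lsx_vilvl_b is a manual sign extension from int8 to int16; a second interleave widens the int16 products into the int32 accumulator. A scalar model of one such product, with made-up values, is below; it is a sketch of the idea, not the LSX code itself.

// Illustrative scalar view of the int8 widening trick: each int8*int8 product is
// formed exactly in int16, then widened to the 32-bit accumulator.
#include <cstdint>
#include <cstdio>

int main()
{
    int8_t val = -3;    // input sample
    int8_t w = 117;     // weight

    int16_t val16 = (int16_t)val;            // what vilvl_b(vslti_b(x, 0), x) achieves
    int16_t w16 = (int16_t)w;

    int16_t prod16 = (int16_t)(val16 * w16); // fits: |prod| <= 127 * 128
    int32_t sum = (int32_t)prod16;           // second widening step, 16 -> 32 bit

    printf("sum = %d\n", (int)sum);          // -351
    return 0;
}
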
+ +static void convolution_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128i _sum = __lsx_vreplgr2vr_w(0); + + const signed char* kptr = weight_data_int8.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vreplgr2vr_h((short)sptr[space_ofs[k]]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0); + + _sum = __lsx_vadd_w(_sum, _s032); + + kptr += 4; + } + } + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_pack4.h b/src/layer/loongarch/convolution_pack4.h new file mode 100644 index 000000000000..66a7863f015b --- /dev/null +++ b/src/layer/loongarch/convolution_pack4.h @@ -0,0 +1,102 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
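
convolution_pack4_lsx below is the fully packed case: every kernel tap reads four packed input channels and sixteen weights and updates four packed output channels, i.e. a small 4x4 matrix-vector product per tap. A scalar equivalent with illustrative numbers (not from the patch):

// Illustrative scalar equivalent of the pack4 inner loop: broadcast in[i] and
// accumulate against weight row i; sum[j] is packed output channel j.
#include <cstdio>

int main()
{
    float in[4] = {1.f, 2.f, 3.f, 4.f};  // one tap, 4 packed input channels
    float w[4][4] = {                    // w[i][j]: input channel i -> output channel j
        {0.1f, 0.2f, 0.3f, 0.4f},
        {0.5f, 0.6f, 0.7f, 0.8f},
        {0.9f, 1.0f, 1.1f, 1.2f},
        {1.3f, 1.4f, 1.5f, 1.6f},
    };
    float sum[4] = {0.f, 0.f, 0.f, 0.f};

    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            sum[j] += in[i] * w[i][j];

    printf("%f %f %f %f\n", sum[0], sum[1], sum[2], sum[3]);
    return 0;
}
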
+ +static void convolution_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + const float* bias_data_ptr = bias_data; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_data_ptr) + { + _sum = (__m128)__lsx_vld(bias_data_ptr + p * 4, 0); + } + + const float* kptr = (const float*)weight_data_pack4.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + for (int k = 0; k < maxk; k++) // 29.23 + { + const float* slptr = sptr + space_ofs[k] * 4; + + __m128 _val0 = __lsx_vreplfr2vr_s(slptr[0]); + __m128 _val1 = __lsx_vreplfr2vr_s(slptr[1]); + __m128 _val2 = __lsx_vreplfr2vr_s(slptr[2]); + __m128 _val3 = __lsx_vreplfr2vr_s(slptr[3]); + + __m128 _w0 = (__m128)__lsx_vld(kptr, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0); + + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + _sum = __lsx_vfmadd_s(_w1, _val1, _sum); + _sum = __lsx_vfmadd_s(_w2, _val2, _sum); + _sum = __lsx_vfmadd_s(_w3, _val3, _sum); + + kptr += 16; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_pack4to1.h b/src/layer/loongarch/convolution_pack4to1.h new file mode 100644 index 000000000000..872759fc7f12 --- /dev/null +++ b/src/layer/loongarch/convolution_pack4to1.h @@ -0,0 +1,94 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
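
convolution_pack4to1_lsx below keeps the accumulator vector-shaped for as long as possible: the four packed input channels are multiplied lane-wise into a 4-lane sum, and only after all taps and channels does __lsx_reduce_fadd_s collapse the lanes into a single output float. A scalar sketch with made-up values:

// Illustrative scalar sketch of the pack4to1 pattern: lane-wise accumulate, then
// one horizontal add at the end.
#include <cstdio>

int main()
{
    const int maxk = 2;                                       // pretend 2 kernel taps
    float in[2][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}};            // in[k][lane]
    float w[2][4] = {{0.1f, 0.2f, 0.3f, 0.4f}, {0.5f, 0.6f, 0.7f, 0.8f}};

    float acc[4] = {0.f, 0.f, 0.f, 0.f};                      // stays vector-shaped
    for (int k = 0; k < maxk; k++)
        for (int lane = 0; lane < 4; lane++)
            acc[lane] += in[k][lane] * w[k][lane];

    // __lsx_reduce_fadd_s equivalent: horizontal add of the 4 lanes
    float sum = acc[0] + acc[1] + acc[2] + acc[3];
    printf("sum = %f\n", sum);                                // prints about 20.4
    return 0;
}
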
+ +static void convolution_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4to1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_data_ptr) + { + sum = bias_data_ptr[p]; + } + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + const float* kptr = (const float*)weight_data_pack4to1.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + + kptr += 4; + } + } + + sum += __lsx_reduce_fadd_s(_sum); + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } +} diff --git a/src/layer/loongarch/convolution_pack8to1_int8.h b/src/layer/loongarch/convolution_pack8to1_int8.h new file mode 100644 index 000000000000..c7463a472b6f --- /dev/null +++ b/src/layer/loongarch/convolution_pack8to1_int8.h @@ -0,0 +1,87 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
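
convolution_pack8to1_int8_lsx below reduces the eight int16 products of each tap with __lsx_vhaddw_w_h(_s0, _s0), which sums adjacent int16 pairs into int32 lanes, so the pairwise sums are already 32-bit before they reach the accumulator. A scalar model of that step, with values chosen so the widening visibly matters; names and numbers are illustrative only.

// Illustrative scalar model of the pairwise widening add used in the pack8to1 path.
#include <cstdint>
#include <cstdio>

int main()
{
    int16_t s16[8] = {100, -200, 300, 400, -32000, -32000, 7, 8}; // eight int16 products

    int32_t widened[4];
    for (int i = 0; i < 4; i++)
        widened[i] = (int32_t)s16[2 * i] + (int32_t)s16[2 * i + 1]; // pairwise, no overflow

    int32_t acc[4] = {0, 0, 0, 0};
    for (int i = 0; i < 4; i++)
        acc[i] += widened[i];                 // running 32-bit accumulator across taps

    printf("%d %d %d %d\n", (int)acc[0], (int)acc[1], (int)acc[2], (int)acc[3]); // -100 700 -64000 15
    return 0;
}
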
+ +static void convolution_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128i _sum = __lsx_vreplgr2vr_w(0); + + const signed char* kptr = weight_data_int8.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vld(sptr + space_ofs[k] * 8, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + + _sum = __lsx_vadd_w(_sum, __lsx_vhaddw_w_h(_s0, _s0)); + + kptr += 8; + } + } + + outptr[j] = __lsx_reduce_add_w(_sum); + } + + outptr += outw; + } + } +} diff --git a/src/layer/loongarch/convolution_pack8to4_int8.h b/src/layer/loongarch/convolution_pack8to4_int8.h new file mode 100644 index 000000000000..00d90387bbed --- /dev/null +++ b/src/layer/loongarch/convolution_pack8to4_int8.h @@ -0,0 +1,120 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
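
In convolution_pack8to4_int8_lsx below, each of _sum0.._sum3 ends the loop holding four partial sums that all belong to one packed output channel. The 4x4 transpose regroups the lanes so that one vector holds the i-th partial of every channel, and the three vector adds that follow collapse the partials into one int32 per output channel. A scalar model of that epilogue with made-up partial sums, offered as an illustration rather than a restatement of the LSX code:

// Illustrative scalar model of the pack8to4 epilogue: transpose the partials, then
// add the rows to get one total per packed output channel.
#include <cstdint>
#include <cstdio>

int main()
{
    // partial[n][i]: i-th partial sum of output channel n (4 partials per channel)
    int32_t partial[4][4] = {
        {1, 2, 3, 4},
        {10, 20, 30, 40},
        {100, 200, 300, 400},
        {1000, 2000, 3000, 4000},
    };

    int32_t t[4][4];                          // transpose: t[i][n] = partial[n][i]
    for (int n = 0; n < 4; n++)
        for (int i = 0; i < 4; i++)
            t[i][n] = partial[n][i];

    int32_t out[4] = {0, 0, 0, 0};            // add the rows -> per-channel totals
    for (int i = 0; i < 4; i++)
        for (int n = 0; n < 4; n++)
            out[n] += t[i][n];

    printf("%d %d %d %d\n", (int)out[0], (int)out[1], (int)out[2], (int)out[3]); // 10 100 1000 10000
    return 0;
}
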
+ +static void convolution_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + const signed char* kptr = weight_data_int8.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vld(sptr + space_ofs[k] * 8, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + __m128i _s2 = __lsx_vmul_h(_val16, _w2); + __m128i _s3 = __lsx_vmul_h(_val16, _w3); + + _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0)); + _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1)); + _sum2 = __lsx_vadd_w(_sum2, __lsx_vhaddw_w_h(_s2, _s2)); + _sum3 = __lsx_vadd_w(_sum3, __lsx_vhaddw_w_h(_s3, _s3)); + + kptr += 32; + } + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + _sum0 = __lsx_vadd_w(_sum0, _sum2); + + __lsx_vst(_sum0, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_sgemm.h b/src/layer/loongarch/convolution_sgemm.h new file mode 100644 index 000000000000..7b74ceac14b2 --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm.h @@ -0,0 +1,650 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void im2col_sgemm_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + const float* bias = _bias; + + // permute + Mat tmp; + if (size >= 4) + tmp.create(4 * maxk, inch, size / 4 + size % 4, 4u, 1, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 4u, 1, opt.workspace_allocator); + { + int nn_size = size / 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = ii * 4; + + float* tmpptr = tmp.channel(i / 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { +#if __loongarch_sx + __lsx_vst(__lsx_vld(img0, 0), tmpptr, 0); +#else + tmpptr[0] = img0[0]; + tmpptr[1] = img0[1]; + tmpptr[2] = img0[2]; + tmpptr[3] = img0[3]; +#endif + img0 += size; + tmpptr += 4; + } + } + } + + int remain_size_start = nn_size * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + float* tmpptr = tmp.channel(i / 4 + i % 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + img0 += size; + tmpptr += 1; + } + } + } + } + +#if __loongarch_sx + int nn_outch = outch >> 3; + int remain_outch_start = nn_outch << 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 8; + + float* outptr0 = top_blob.channel(p); + float* outptr1 = top_blob.channel(p + 1); + float* outptr2 = top_blob.channel(p + 2); + float* outptr3 = top_blob.channel(p + 3); + float* outptr4 = top_blob.channel(p + 4); + float* outptr5 = top_blob.channel(p + 5); + float* outptr6 = top_blob.channel(p + 6); + float* outptr7 = top_blob.channel(p + 7); + + const float zeros[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* biasptr = bias ? 
bias + p : zeros; + + int i = 0; + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 4); + const float* kptr = kernel.channel(p / 8); + + int nn = inch * maxk; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(biasptr[0]); + __m128 _sum1 = __lsx_vreplfr2vr_s(biasptr[1]); + __m128 _sum2 = __lsx_vreplfr2vr_s(biasptr[2]); + __m128 _sum3 = __lsx_vreplfr2vr_s(biasptr[3]); + __m128 _sum4 = __lsx_vreplfr2vr_s(biasptr[4]); + __m128 _sum5 = __lsx_vreplfr2vr_s(biasptr[5]); + __m128 _sum6 = __lsx_vreplfr2vr_s(biasptr[6]); + __m128 _sum7 = __lsx_vreplfr2vr_s(biasptr[7]); + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 32); + __m128 _val = (__m128)__lsx_vld(tmpptr, 0); + __m128i _w0123 = __lsx_vld(kptr, 0); + __m128i _w4567 = __lsx_vld(kptr + 4, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3); + _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 0), _val, _sum4); + _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 1), _val, _sum5); + _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 2), _val, _sum6); + _sum7 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 3), _val, _sum7); + + tmpptr += 4; + kptr += 8; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + __lsx_vst(_sum2, outptr2, 0); + __lsx_vst(_sum3, outptr3, 0); + __lsx_vst(_sum4, outptr4, 0); + __lsx_vst(_sum5, outptr5, 0); + __lsx_vst(_sum6, outptr6, 0); + __lsx_vst(_sum7, outptr7, 0); + + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + outptr4 += 4; + outptr5 += 4; + outptr6 += 4; + outptr7 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 4 + i % 4); + const float* kptr = kernel.channel(p / 8); + + int nn = inch * maxk; // inch always > 0 + + float sum0 = biasptr[0]; + float sum1 = biasptr[1]; + float sum2 = biasptr[2]; + float sum3 = biasptr[3]; + float sum4 = biasptr[4]; + float sum5 = biasptr[5]; + float sum6 = biasptr[6]; + float sum7 = biasptr[7]; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + sum2 += tmpptr[0] * kptr[2]; + sum3 += tmpptr[0] * kptr[3]; + sum4 += tmpptr[0] * kptr[4]; + sum5 += tmpptr[0] * kptr[5]; + sum6 += tmpptr[0] * kptr[6]; + sum7 += tmpptr[0] * kptr[7]; + tmpptr++; + kptr += 8; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr2[0] = sum2; + outptr3[0] = sum3; + outptr4[0] = sum4; + outptr5[0] = sum5; + outptr6[0] = sum6; + outptr7[0] = sum7; + + outptr0++; + outptr1++; + outptr2++; + outptr3++; + outptr4++; + outptr5++; + outptr6++; + outptr7++; + } + } + + nn_outch = (outch - remain_outch_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = remain_outch_start + pp * 4; + + float* outptr0 = top_blob.channel(p); + float* outptr1 = top_blob.channel(p + 1); + float* outptr2 = top_blob.channel(p + 2); + float* outptr3 = top_blob.channel(p + 3); + + const float zeros[4] = {0.f, 0.f, 0.f, 0.f}; + const float* biasptr = bias ? 
bias + p : zeros; + + int i = 0; + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 4); + const float* kptr = kernel.channel(p / 8 + (p % 8) / 4); + + int nn = inch * maxk; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(biasptr[0]); + __m128 _sum1 = __lsx_vreplfr2vr_s(biasptr[1]); + __m128 _sum2 = __lsx_vreplfr2vr_s(biasptr[2]); + __m128 _sum3 = __lsx_vreplfr2vr_s(biasptr[3]); + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 16); + __m128 _val = (__m128)__lsx_vld(tmpptr, 0); + __m128i _w0123 = __lsx_vld(kptr, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3); + + tmpptr += 4; + kptr += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + __lsx_vst(_sum2, outptr2, 0); + __lsx_vst(_sum3, outptr3, 0); + + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 4 + i % 4); + const float* kptr = kernel.channel(p / 8 + (p % 8) / 4); + + int nn = inch * maxk; // inch always > 0 + + float sum0 = biasptr[0]; + float sum1 = biasptr[1]; + float sum2 = biasptr[2]; + float sum3 = biasptr[3]; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + sum2 += tmpptr[0] * kptr[2]; + sum3 += tmpptr[0] * kptr[3]; + tmpptr++; + kptr += 4; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr2[0] = sum2; + outptr3[0] = sum3; + + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + } + + remain_outch_start += nn_outch << 2; +#else // __loongarch_sx + int nn_outch = outch >> 1; + int remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + float* outptr0 = top_blob.channel(p); + float* outptr1 = top_blob.channel(p + 1); + + const float zeros[2] = {0.f, 0.f}; + const float* biasptr = bias ? 
bias + p : zeros; + + int i = 0; + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 4); + const float* kptr = kernel.channel(p / 2); + + int nn = inch * maxk; // inch always > 0 + + float sum00 = biasptr[0]; + float sum01 = biasptr[0]; + float sum02 = biasptr[0]; + float sum03 = biasptr[0]; + float sum10 = biasptr[1]; + float sum11 = biasptr[1]; + float sum12 = biasptr[1]; + float sum13 = biasptr[1]; + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 8); + float k0 = kptr[0]; + float k1 = kptr[1]; + sum00 += tmpptr[0] * k0; + sum01 += tmpptr[1] * k0; + sum02 += tmpptr[2] * k0; + sum03 += tmpptr[3] * k0; + sum10 += tmpptr[0] * k1; + sum11 += tmpptr[1] * k1; + sum12 += tmpptr[2] * k1; + sum13 += tmpptr[3] * k1; + tmpptr += 4; + kptr += 2; + } + + outptr0[0] = sum00; + outptr0[1] = sum01; + outptr0[2] = sum02; + outptr0[3] = sum03; + outptr1[0] = sum10; + outptr1[1] = sum11; + outptr1[2] = sum12; + outptr1[3] = sum13; + + outptr0 += 4; + outptr1 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 4 + i % 4); + const float* kptr = kernel.channel(p / 2); + + int nn = inch * maxk; // inch always > 0 + + float sum0 = biasptr[0]; + float sum1 = biasptr[1]; + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 4); + __builtin_prefetch(kptr + 8); + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + tmpptr++; + kptr += 2; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + + outptr0++; + outptr1++; + } + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + float* outptr0 = top_blob.channel(p); + + const float bias0 = bias ? bias[p] : 0.f; + + int i = 0; + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 4); +#if __loongarch_sx + const float* kptr = kernel.channel(p / 8 + (p % 8) / 4 + p % 4); +#else + const float* kptr = kernel.channel(p / 2 + p % 2); +#endif + + int nn = inch * maxk; // inch always > 0 + +#if __loongarch_sx + __m128 _sum0 = __lsx_vreplfr2vr_s(bias0); + + for (int q = 0; q < nn; q++) + { + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vld(tmpptr, 0), __lsx_vreplfr2vr_s(kptr[0]), _sum0); + tmpptr += 4; + kptr++; + } + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; +#else + float sum0 = bias0; + float sum1 = bias0; + float sum2 = bias0; + float sum3 = bias0; + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 4); + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[1] * kptr[0]; + sum2 += tmpptr[2] * kptr[0]; + sum3 += tmpptr[3] * kptr[0]; + tmpptr += 4; + kptr++; + } + + outptr0[0] = sum0; + outptr0[1] = sum1; + outptr0[2] = sum2; + outptr0[3] = sum3; + + outptr0 += 4; +#endif // __loongarch_sx + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 4 + i % 4); +#if __loongarch_sx + const float* kptr = kernel.channel(p / 8 + (p % 8) / 4 + p % 4); +#else + const float* kptr = kernel.channel(p / 2 + p % 2); +#endif + + int nn = inch * maxk; // inch always > 0 + + float sum0 = bias0; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + tmpptr++; + kptr++; + } + + outptr0[0] = sum0; + + outptr0++; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 8b-maxk-inch-outch/8b + Mat kernel = 
_kernel.reshape(maxk, inch, outch); +#if __loongarch_sx + kernel_tm.create(8 * maxk, inch, outch / 8 + (outch % 8) / 4 + outch % 4); +#else + kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2); +#endif + + int q = 0; +#if __loongarch_sx + for (; q + 7 < outch; q += 8) + { + const Mat k0 = kernel.channel(q); + const Mat k1 = kernel.channel(q + 1); + const Mat k2 = kernel.channel(q + 2); + const Mat k3 = kernel.channel(q + 3); + const Mat k4 = kernel.channel(q + 4); + const Mat k5 = kernel.channel(q + 5); + const Mat k6 = kernel.channel(q + 6); + const Mat k7 = kernel.channel(q + 7); + + float* g00 = kernel_tm.channel(q / 8); + + for (int p = 0; p < inch; p++) + { + const float* k00 = k0.row(p); + const float* k10 = k1.row(p); + const float* k20 = k2.row(p); + const float* k30 = k3.row(p); + const float* k40 = k4.row(p); + const float* k50 = k5.row(p); + const float* k60 = k6.row(p); + const float* k70 = k7.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + g00[1] = k10[k]; + g00[2] = k20[k]; + g00[3] = k30[k]; + g00[4] = k40[k]; + g00[5] = k50[k]; + g00[6] = k60[k]; + g00[7] = k70[k]; + + g00 += 8; + } + } + } + for (; q + 3 < outch; q += 4) + { + const Mat k0 = kernel.channel(q); + const Mat k1 = kernel.channel(q + 1); + const Mat k2 = kernel.channel(q + 2); + const Mat k3 = kernel.channel(q + 3); + + float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4); + + for (int p = 0; p < inch; p++) + { + const float* k00 = k0.row(p); + const float* k10 = k1.row(p); + const float* k20 = k2.row(p); + const float* k30 = k3.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + g00[1] = k10[k]; + g00[2] = k20[k]; + g00[3] = k30[k]; + + g00 += 4; + } + } + } +#else + for (; q + 1 < outch; q += 2) + { + const Mat k0 = kernel.channel(q); + const Mat k1 = kernel.channel(q + 1); + + float* g00 = kernel_tm.channel(q / 2); + + for (int p = 0; p < inch; p++) + { + const float* k00 = k0.row(p); + const float* k10 = k1.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + g00[1] = k10[k]; + + g00 += 2; + } + } + } +#endif // __loongarch_sx + for (; q < outch; q++) + { + const Mat k0 = kernel.channel(q); + +#if __loongarch_sx + float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + q % 4); +#else + float* g00 = kernel_tm.channel(q / 2 + q % 2); +#endif + + for (int p = 0; p < inch; p++) + { + const float* k00 = k0.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + + g00 += 1; + } + } + } +} + +static void convolution_im2col_sgemm_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + float* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const float* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + 
im2col_sgemm_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_int8.h b/src/layer/loongarch/convolution_sgemm_int8.h new file mode 100644 index 000000000000..98f47760901f --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_int8.h @@ -0,0 +1,800 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void im2col_sgemm_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; +#if __loongarch_sx + if (inch >= 4) + { + if (size >= 2) + tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator); + } + else +#endif // __loongarch_sx + { + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator); + } + { + int remain_size_start = 0; + int nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + signed char* tmpptr = tmp.channel(i / 2); + + int q = 0; +#if __loongarch_sx + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr[4] = img0[1]; + tmpptr[5] = img1[1]; + tmpptr[6] = img2[1]; + tmpptr[7] = img3[1]; + tmpptr += 8; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img0[1]; + + tmpptr += 2; + + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + signed char* tmpptr = tmp.channel(i / 2 + i % 2); + + int q = 0; +#if __loongarch_sx + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed 
char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr += 4; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + + tmpptr += 1; + + img0 += size; + } + } + } + } + +#if __loongarch_sx + int nn_outch = outch >> 2; + int remain_outch_start = nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + int* outptr2 = top_blob.channel(p + 2); + int* outptr3 = top_blob.channel(p + 3); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val01 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _val0 = __lsx_vilvl_d(_val01, _val01); + __m128i _val1 = __lsx_vilvh_d(_val01, _val01); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + + __m128i _s00 = __lsx_vmul_h(_val0, _w0); + __m128i _s01 = __lsx_vmul_h(_val0, _w1); + __m128i _s10 = __lsx_vmul_h(_val1, _w0); + __m128i _s11 = __lsx_vmul_h(_val1, _w1); + + __m128i _exts00 = __lsx_vslti_h(_s00, 0); + __m128i _exts01 = __lsx_vslti_h(_s01, 0); + __m128i _exts10 = __lsx_vslti_h(_s10, 0); + __m128i _exts11 = __lsx_vslti_h(_s11, 0); + __m128i _s00l = __lsx_vilvl_h(_exts00, _s00); + __m128i _s00h = __lsx_vilvh_h(_exts00, _s00); + __m128i _s01l = __lsx_vilvl_h(_exts01, _s01); + __m128i _s01h = __lsx_vilvh_h(_exts01, _s01); + __m128i _s10l = __lsx_vilvl_h(_exts10, _s10); + __m128i _s10h = __lsx_vilvh_h(_exts10, _s10); + __m128i _s11l = __lsx_vilvl_h(_exts11, _s11); + __m128i _s11h = __lsx_vilvh_h(_exts11, _s11); + + _sum00 = __lsx_vadd_w(_sum00, _s00l); + _sum01 = __lsx_vadd_w(_sum01, _s00h); + _sum02 = __lsx_vadd_w(_sum02, _s01l); + _sum03 = __lsx_vadd_w(_sum03, _s01h); + _sum10 = __lsx_vadd_w(_sum10, _s10l); + _sum11 = __lsx_vadd_w(_sum11, _s10h); + _sum12 = __lsx_vadd_w(_sum12, _s11l); + _sum13 = __lsx_vadd_w(_sum13, _s11h); + + tmpptr += 8; + kptr += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = 
__lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + } + + int j = 0; + for (; j < nn1; j++) + { + __m128i _val0 = __lsx_vreplgr2vr_h(tmpptr[0]); + __m128i _val1 = __lsx_vreplgr2vr_h(tmpptr[1]); + __m128i _val = __lsx_vilvl_d(_val1, _val0); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum00 = __lsx_vadd_w(_sum00, _s0l); + _sum10 = __lsx_vadd_w(_sum10, _s0h); + + tmpptr += 2; + kptr += 4; + } + + int sum[8]; + __lsx_vst(_sum00, sum, 0); + __lsx_vst(_sum10, sum + 4, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0[1] = sum[4]; + outptr1[1] = sum[5]; + outptr2[1] = sum[6]; + outptr3[1] = sum[7]; + outptr0 += 2; + outptr1 += 2; + outptr2 += 2; + outptr3 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + _val16 = __lsx_vilvl_d(_val16, _val16); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _exts1 = __lsx_vslti_h(_s1, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + __m128i _s1l = __lsx_vilvl_h(_exts1, _s1); + __m128i _s1h = __lsx_vilvh_h(_exts1, _s1); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + _sum2 = __lsx_vadd_w(_sum2, _s1l); + _sum3 = __lsx_vadd_w(_sum3, _s1h); + + tmpptr += 4; + kptr += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + _sum0 = __lsx_vadd_w(_sum0, _sum2); + } + int j = 0; + for (; j < nn1; j++) + { + __m128i _val = __lsx_vreplgr2vr_h(tmpptr[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0); + + 
_sum0 = __lsx_vadd_w(_sum0, _s032); + + tmpptr += 1; + kptr += 4; + } + + int sum[4]; + __lsx_vst(_sum0, sum, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } +#else // __loongarch_sx + int nn_outch = outch >> 1; + int remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 2); + + int sum00 = 0; + int sum01 = 0; + int sum10 = 0; + int sum11 = 0; + + int nn1 = inch * maxk; + + int j = 0; + for (; j < nn1; j++) + { + signed char val0 = tmpptr[0]; + signed char val1 = tmpptr[1]; + signed char w0 = kptr[0]; + signed char w1 = kptr[1]; + + sum00 += val0 * w0; + sum01 += val1 * w0; + sum10 += val0 * w1; + sum11 += val1 * w1; + + tmpptr += 2; + kptr += 2; + } + + outptr0[0] = sum00; + outptr0[1] = sum01; + outptr1[0] = sum10; + outptr1[1] = sum11; + outptr0 += 2; + outptr1 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 2); + + int sum00 = 0; + int sum10 = 0; + + int nn1 = inch * maxk; + + int j = 0; + for (; j < nn1; j++) + { + signed char val0 = tmpptr[0]; + signed char w0 = kptr[0]; + signed char w1 = kptr[1]; + + sum00 += val0 * w0; + sum10 += val0 * w1; + + tmpptr += 1; + kptr += 2; + } + + outptr0[0] = sum00; + outptr1[0] = sum10; + outptr0 += 1; + outptr1 += 1; + } + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); +#if __loongarch_sx + const signed char* kptr = kernel.channel(p / 4 + p % 4); +#else + const signed char* kptr = kernel.channel(p / 2 + p % 2); +#endif + + int sum0 = 0; + int sum1 = 0; + +#if __loongarch_sx + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + if (nn4 > 0) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + + tmpptr += 8; + kptr += 4; + } + + sum0 = __lsx_reduce_add_w(_sum0); + sum1 = __lsx_reduce_add_w(_sum1); + } +#else + int nn1 = inch * maxk; +#endif // __loongarch_sx + + int j = 0; + for (; j < nn1; j++) + { + signed char val0 = tmpptr[0]; + signed char val1 = tmpptr[1]; + signed char w = kptr[0]; + + sum0 += val0 * w; + sum1 += val1 * w; + + tmpptr += 2; + kptr += 1; + } + + outptr0[0] = sum0; + outptr0[1] = sum1; + outptr0 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); +#if __loongarch_sx + const signed char* kptr = kernel.channel(p / 4 + p % 4); +#else + const signed char* 
kptr = kernel.channel(p / 2 + p % 2); +#endif + + int sum = 0; + +#if __loongarch_sx + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + if (nn4 > 0) + { + __m128i _sum = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0); + + _sum = __lsx_vadd_w(_sum, _s032); + + tmpptr += 4; + kptr += 4; + } + + sum = __lsx_reduce_add_w(_sum); + } +#else + int nn1 = inch * maxk; +#endif // __loongarch_sx + + int j = 0; + for (; j < nn1; j++) + { + signed char val = tmpptr[0]; + signed char w = kptr[0]; + + sum += val * w; + + tmpptr += 1; + kptr += 1; + } + + outptr0[0] = sum; + outptr0 += 1; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 4a-4b-maxk-inch/4a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); +#if __loongarch_sx + if (outch >= 4) + { + if (inch >= 4) + kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4 + outch % 4, (size_t)1u); + else + kernel_tm.create(4 * maxk, inch, outch / 4 + outch % 4, (size_t)1u); + } +#else + if (outch >= 2) + { + kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2, (size_t)1u); + } +#endif // __loongarch_sx + else + { +#if __loongarch_sx + if (inch >= 4) + kernel_tm.create(4 * maxk, inch / 4 + inch % 4, outch, (size_t)1u); + else +#endif // __loongarch_sx + { + kernel_tm.create(1 * maxk, inch, outch, (size_t)1u); + } + } + + int q = 0; +#if __loongarch_sx + for (; q + 3 < outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + int p = 0; + for (; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const signed char* k00 = kernel.channel(q + i).row(p + j); + g00[0] = k00[k]; + g00++; + } + } + } + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + const signed char* k00 = kernel.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#else // __loongarch_sx + for (; q + 1 < outch; q += 2) + { + signed char* g00 = kernel_tm.channel(q / 2); + + int p = 0; + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 2; i++) + { + const signed char* k00 = kernel.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#endif // __loongarch_sx + for (; q < outch; q++) + { +#if __loongarch_sx + signed char* g00 = kernel_tm.channel(q / 4 + q % 4); +#else + signed char* g00 = kernel_tm.channel(q / 2 + q % 2); +#endif + + int p = 0; +#if __loongarch_sx + for (; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int j = 0; j < 4; j++) + { + const signed char* k00 = kernel.channel(q).row(p + j); + g00[0] = k00[k]; + g00++; + } + } + } +#endif // __loongarch_sx + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k00 = kernel.channel(q).row(p); + g00[0] = k00[k]; + g00++; + } + } + } +} + +static void convolution_im2col_sgemm_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, 
const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + signed char* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const signed char* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + ptr[2] = sptr[stride_w * 2]; + ptr[3] = sptr[stride_w * 3]; + + sptr += stride_w * 4; + ptr += 4; + } + for (; j + 1 < outw; j += 2) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + + sptr += stride_w * 2; + ptr += 2; + } + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack1to4_int8.h b/src/layer/loongarch/convolution_sgemm_pack1to4_int8.h new file mode 100644 index 000000000000..3429bfae5fa6 --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack1to4_int8.h @@ -0,0 +1,481 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
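Every sgemm variant in this patch, including the pack1to4 int8 kernel that follows, first lowers the convolution to a matrix product: im2col copies, for each output pixel, the kernel_w x kernel_h window of every input channel into one column, and the transform_kernel helpers interleave the weights so the inner loops can stream both operands contiguously. A plain scalar equivalent of the im2col loop is sketched below; ncnn's Mat row/channel accessors are flattened to raw pointers and the function name is made up for the sketch.

// Scalar im2col for one input channel (sketch only, mirrors the loop in
// convolution_im2col_sgemm_int8_lsx): output row (u * kernel_w + v) holds the
// input value each output pixel reads through kernel tap (u, v).
static void im2col_channel_ref(const signed char* img, int w, // input plane of width w
                               signed char* col,              // maxk rows of outw*outh values
                               int outw, int outh,
                               int kernel_w, int kernel_h,
                               int dilation_w, int dilation_h,
                               int stride_w, int stride_h)
{
    for (int u = 0; u < kernel_h; u++)
    {
        for (int v = 0; v < kernel_w; v++)
        {
            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    *col++ = img[(i * stride_h + u * dilation_h) * w + j * stride_w + v * dilation_w];
                }
            }
        }
    }
}

After this lowering, the permute step regroups output columns in pairs (and input channels in groups of four under __loongarch_sx), which is why the 4-output-channel inner loop above reads 16 interleaved weight bytes per step.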
+ +static void im2col_sgemm_pack1to4_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (inch >= 4) + { + if (size >= 2) + tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator); + } + else + { + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator); + } + { + int remain_size_start = 0; + int nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + signed char* tmpptr = tmp.channel(i / 2); + + int q = 0; + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr[4] = img0[1]; + tmpptr[5] = img1[1]; + tmpptr[6] = img2[1]; + tmpptr[7] = img3[1]; + tmpptr += 8; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img0[1]; + + tmpptr += 2; + + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + signed char* tmpptr = tmp.channel(i / 2 + i % 2); + + int q = 0; + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr += 4; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + + tmpptr += 1; + + img0 += size; + } + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = 
__lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val01 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _val0 = __lsx_vilvl_d(_val01, _val01); + __m128i _val1 = __lsx_vilvh_d(_val01, _val01); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + + __m128i _s00 = __lsx_vmul_h(_val0, _w0); + __m128i _s01 = __lsx_vmul_h(_val0, _w1); + __m128i _s10 = __lsx_vmul_h(_val1, _w0); + __m128i _s11 = __lsx_vmul_h(_val1, _w1); + + __m128i _exts00 = __lsx_vslti_h(_s00, 0); + __m128i _exts01 = __lsx_vslti_h(_s01, 0); + __m128i _exts10 = __lsx_vslti_h(_s10, 0); + __m128i _exts11 = __lsx_vslti_h(_s11, 0); + __m128i _s00l = __lsx_vilvl_h(_exts00, _s00); + __m128i _s00h = __lsx_vilvh_h(_exts00, _s00); + __m128i _s01l = __lsx_vilvl_h(_exts01, _s01); + __m128i _s01h = __lsx_vilvh_h(_exts01, _s01); + __m128i _s10l = __lsx_vilvl_h(_exts10, _s10); + __m128i _s10h = __lsx_vilvh_h(_exts10, _s10); + __m128i _s11l = __lsx_vilvl_h(_exts11, _s11); + __m128i _s11h = __lsx_vilvh_h(_exts11, _s11); + + _sum00 = __lsx_vadd_w(_sum00, _s00l); + _sum01 = __lsx_vadd_w(_sum01, _s00h); + _sum02 = __lsx_vadd_w(_sum02, _s01l); + _sum03 = __lsx_vadd_w(_sum03, _s01h); + _sum10 = __lsx_vadd_w(_sum10, _s10l); + _sum11 = __lsx_vadd_w(_sum11, _s10h); + _sum12 = __lsx_vadd_w(_sum12, _s11l); + _sum13 = __lsx_vadd_w(_sum13, _s11h); + + tmpptr += 8; + kptr += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = __lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + } + + int j = 0; + for (; j < nn1; j++) + { + __m128i _val0 = __lsx_vreplgr2vr_h(tmpptr[0]); + __m128i _val1 = __lsx_vreplgr2vr_h(tmpptr[1]); + __m128i _val = __lsx_vilvl_d(_val1, _val0); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum00 = __lsx_vadd_w(_sum00, _s0l); + _sum10 = __lsx_vadd_w(_sum10, _s0h); + + tmpptr += 2; + kptr += 4; + } + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum10, outptr0 + 4, 0); + outptr0 += 8; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* 
kptr = kernel.channel(p); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + _val16 = __lsx_vilvl_d(_val16, _val16); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _exts1 = __lsx_vslti_h(_s1, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + __m128i _s1l = __lsx_vilvl_h(_exts1, _s1); + __m128i _s1h = __lsx_vilvh_h(_exts1, _s1); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + _sum2 = __lsx_vadd_w(_sum2, _s1l); + _sum3 = __lsx_vadd_w(_sum3, _s1h); + + tmpptr += 4; + kptr += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + _sum0 = __lsx_vadd_w(_sum0, _sum2); + } + + int j = 0; + for (; j < nn1; j++) + { + __m128i _val = __lsx_vreplgr2vr_h(tmpptr[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s032); + + tmpptr += 1; + kptr += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + outptr0 += 4; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 4a-4b-maxk-inch/4a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); + if (inch >= 4) + kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4, (size_t)1u); + else + kernel_tm.create(4 * maxk, inch, outch / 4, (size_t)1u); + + for (int q = 0; q + 3 < outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + int p = 0; + for (; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const signed char* k00 = kernel.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + const signed char* k00 = kernel.channel(q + i).row(p); + + g00[0] = k00[k]; + + g00++; + } + } + } + } +} + +static void convolution_im2col_sgemm_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = 
top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + signed char* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const signed char* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + ptr[2] = sptr[stride_w * 2]; + ptr[3] = sptr[stride_w * 3]; + + sptr += stride_w * 4; + ptr += 4; + } + for (; j + 1 < outw; j += 2) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + + sptr += stride_w * 2; + ptr += 2; + } + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack1to4_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack4.h b/src/layer/loongarch/convolution_sgemm_pack4.h new file mode 100644 index 000000000000..e3e7279a5d2c --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack4.h @@ -0,0 +1,519 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
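The float pack4 kernel that follows keeps the same im2col + GEMM structure but tiles the output columns in groups of 12, 8, 4, 2 and 1, holding one __m128 accumulator per column in the tile; each inner-loop step broadcasts a single input value (__lsx_vreplvei_w, or __lsx_vreplfr2vr_s in the tail) into a fused multiply-add against one 4-wide weight vector, so every accumulator collects four output channels for its column. The scalar view of a single column is sketched below; the function name and array arguments are illustrative only, not part of the patch.

// What one column of the pack4 sgemm inner loop computes (sketch only).
// The single-column tail does exactly this with
//     _sum = __lsx_vfmadd_s(_w0, __lsx_vreplfr2vr_s(*tmpptr++), _sum);
// the wider tiles just run 2/4/8/12 such accumulators in parallel.
static void sgemm_pack4_column_ref(const float* tmpptr, // permuted inputs for this column
                                   const float* kptr0,  // weights, 4 output channels per step
                                   const float* bias4,  // 4 bias values, may be NULL
                                   float out4[4],       // 4 output channels at this pixel
                                   int nn)              // inch * maxk * 4 steps
{
    for (int oc = 0; oc < 4; oc++)
        out4[oc] = bias4 ? bias4[oc] : 0.f;

    for (int j = 0; j < nn; j++)
    {
        for (int oc = 0; oc < 4; oc++)
            out4[oc] += tmpptr[j] * kptr0[j * 4 + oc];
    }
}

The widest tile stops at 12 because 12 accumulators plus the broadcast values and one weight vector still fit comfortably in LSX's 32 vector registers.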
+ +static void im2col_sgemm_pack4_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + const float* bias = _bias; + + // permute + Mat tmp; + if (size >= 12) + tmp.create(12 * maxk, inch, size / 12 + (size % 12) / 8 + (size % 12 % 8) / 4 + (size % 12 % 4) / 2 + size % 12 % 2, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 8) + tmp.create(8 * maxk, inch, size / 8 + (size % 8) / 4 + (size % 4) / 2 + size % 2, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 4) + tmp.create(4 * maxk, inch, size / 4 + (size % 4) / 2 + size % 2, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 4u * 4, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 4u * 4, 4, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = size / 12; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 12; + + float* tmpptr = tmp.channel(i / 12); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x12 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0); + __m128i _r8 = __lsx_vld(img0 + 4 * 8, 0); + __m128i _r9 = __lsx_vld(img0 + 4 * 9, 0); + __m128i _ra = __lsx_vld(img0 + 4 * 10, 0); + __m128i _rb = __lsx_vld(img0 + 4 * 11, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r89r = __lsx_vilvl_w(_r9, _r8); + __m128i _r89l = __lsx_vilvh_w(_r9, _r8); + __m128i _rabr = __lsx_vilvl_w(_rb, _ra); + __m128i _rabl = __lsx_vilvh_w(_rb, _ra); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + __m128i _r89ab_0 = __lsx_vilvl_d(_rabr, _r89r); + __m128i _r89ab_1 = __lsx_vilvh_d(_rabr, _r89r); + __m128i _r89ab_2 = __lsx_vilvl_d(_rabl, _r89l); + __m128i _r89ab_3 = __lsx_vilvh_d(_rabl, _r89l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r89ab_0, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 4, 0); + __lsx_vst(_r89ab_1, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 7, 0); + __lsx_vst(_r89ab_2, tmpptr + 4 * 8, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 9, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 10, 0); + 
__lsx_vst(_r89ab_3, tmpptr + 4 * 11, 0); + + img0 += size * 4; + tmpptr += 48; + } + } + } + + remain_size_start += nn_size * 12; + nn_size = (size - remain_size_start) >> 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 8; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x8 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 2, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 4, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 7, 0); + + img0 += size * 4; + tmpptr += 32; + } + } + } + + remain_size_start += nn_size << 3; + nn_size = (size - remain_size_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 4; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r0123_1, tmpptr + 4, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 3, 0); + + img0 += size * 4; + tmpptr += 16; + } + } + } + + remain_size_start += nn_size << 2; + nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2); + + for (int q = 0; q < 
inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x2 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + + __m128i _r01_0 = __lsx_vilvl_w(_r1, _r0); + __m128i _r01_1 = __lsx_vilvh_w(_r1, _r0); + + __lsx_vst(_r01_0, tmpptr, 0); + __lsx_vst(_r01_1, tmpptr + 4, 0); + + img0 += size * 4; + tmpptr += 8; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vld(img0, 0); + __lsx_vst(_val, tmpptr, 0); + + img0 += size * 4; + tmpptr += 4; + } + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 11 < size; i += 12) + { + const float* tmpptr = tmp.channel(i / 12); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = _sum0; + __m128 _sum2 = _sum0; + __m128 _sum3 = _sum0; + __m128 _sum4 = _sum0; + __m128 _sum5 = _sum0; + __m128 _sum6 = _sum0; + __m128 _sum7 = _sum0; + __m128 _sum8 = _sum0; + __m128 _sum9 = _sum0; + __m128 _suma = _sum0; + __m128 _sumb = _sum0; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 48); + __builtin_prefetch(kptr0 + 16); + __m128i _val0123 = __lsx_vld(tmpptr, 0); + __m128i _val4567 = __lsx_vld(tmpptr + 4, 0); + __m128i _val89ab = __lsx_vld(tmpptr + 8, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4); + _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5); + _sum6 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6); + _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7); + _sum8 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 0), _sum8); + _sum9 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 1), _sum9); + _suma = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 2), _suma); + _sumb = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 3), _sumb); + + tmpptr += 12; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + __lsx_vst(_sum4, outptr0 + 4 * 4, 0); + __lsx_vst(_sum5, outptr0 + 4 * 5, 0); + __lsx_vst(_sum6, outptr0 + 4 * 6, 0); + __lsx_vst(_sum7, outptr0 + 4 * 7, 0); + __lsx_vst(_sum8, outptr0 + 4 * 8, 0); + __lsx_vst(_sum9, outptr0 + 4 * 9, 0); + __lsx_vst(_suma, outptr0 + 4 * 10, 0); + __lsx_vst(_sumb, outptr0 + 4 * 11, 0); + + outptr0 += 4 * 12; + } + for (; i + 7 < size; i += 8) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch 
always > 0 + + __m128 _sum0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = _sum0; + __m128 _sum2 = _sum0; + __m128 _sum3 = _sum0; + __m128 _sum4 = _sum0; + __m128 _sum5 = _sum0; + __m128 _sum6 = _sum0; + __m128 _sum7 = _sum0; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr0 + 16); + __m128i _val0123 = __lsx_vld(tmpptr, 0); + __m128i _val4567 = __lsx_vld(tmpptr + 4, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4); + _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5); + _sum6 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6); + _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7); + + tmpptr += 8; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + __lsx_vst(_sum4, outptr0 + 4 * 4, 0); + __lsx_vst(_sum5, outptr0 + 4 * 5, 0); + __lsx_vst(_sum6, outptr0 + 4 * 6, 0); + __lsx_vst(_sum7, outptr0 + 4 * 7, 0); + + outptr0 += 4 * 8; + } + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = _sum0; + __m128 _sum2 = _sum0; + __m128 _sum3 = _sum0; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr0 + 16); + __m128i _val0123 = __lsx_vld(tmpptr, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + + tmpptr += 4; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + + outptr0 += 4 * 4; + } + for (; i + 1 < size; i += 2) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = bias ? 
(__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = _sum0; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 8); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*tmpptr++); + __m128 _val1 = __lsx_vreplfr2vr_s(*tmpptr++); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, _val0, _sum0); + _sum1 = __lsx_vfmadd_s(_w0, _val1, _sum1); + + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + + outptr0 += 4 * 2; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 4); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*tmpptr++); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + + kptr0 += 4; + } + + __lsx_vst(_sum, outptr0, 0); + + outptr0 += 4; + } + } +} + +static void convolution_im2col_sgemm_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator); + { + const int gap = (w * stride_h - outw * stride_w) * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + float* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const float* sptr = img.row(dilation_h * u) + dilation_w * v * 4; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __lsx_vst(_val, ptr, 0); + + sptr += stride_w * 4; + ptr += 4; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack4_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack4to1.h b/src/layer/loongarch/convolution_sgemm_pack4to1.h new file mode 100644 index 000000000000..3748645b4d4c --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack4to1.h @@ -0,0 +1,667 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
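[editor's note] The convolution_im2col_sgemm_*_lsx wrappers in these files all follow the same im2col + sgemm split: every kernel tap of every input channel is copied out into a size x (inch * maxk) matrix, after which the convolution is an ordinary matrix product against the re-laid-out kernel. The scalar sketch below (hypothetical names, stride 1, no dilation, no packing, no LSX) only illustrates that decomposition and is not part of the patch:

#include <cstddef>
#include <vector>

// naive im2col + matmul reference: bottom is [inch][h][w], weight is
// [outch][inch][maxk] with taps ordered u * kernel_w + v, top is [outch][outh][outw]
static void conv_im2col_gemm_ref(const float* bottom, int w, int h, int inch,
                                 const float* weight, float* top,
                                 int outw, int outh, int outch,
                                 int kernel_w, int kernel_h)
{
    const int size = outw * outh;
    const int maxk = kernel_w * kernel_h;

    // im2col: row = (input channel, kernel tap), column = output position
    std::vector<float> cols((size_t)inch * maxk * size);
    for (int q = 0; q < inch; q++)
        for (int u = 0; u < kernel_h; u++)
            for (int v = 0; v < kernel_w; v++)
                for (int i = 0; i < outh; i++)
                    for (int j = 0; j < outw; j++)
                        cols[((size_t)q * maxk + u * kernel_w + v) * size + i * outw + j]
                            = bottom[(size_t)q * w * h + (i + u) * w + (j + v)];

    // sgemm: top[p][x] = sum over (q, k) of weight[p][q][k] * cols[q * maxk + k][x]
    for (int p = 0; p < outch; p++)
        for (int x = 0; x < size; x++)
        {
            float sum = 0.f;
            for (int qk = 0; qk < inch * maxk; qk++)
                sum += weight[(size_t)p * inch * maxk + qk] * cols[(size_t)qk * size + x];
            top[(size_t)p * size + x] = sum;
        }
}

The LSX kernels vectorize exactly this product; the extra packing passes only reorder the columns so that blocks of 12, 8, 4, 2 or 1 output positions (depending on the variant) can be streamed through the four-lane accumulators without strided loads.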
+ +static void im2col_sgemm_pack4to1_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + const float* bias = _bias; + + Mat tmp; + if (size >= 12) + tmp.create(12 * maxk, inch, size / 12 + (size % 12) / 8 + (size % 12 % 8) / 4 + size % 12 % 4, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 8) + tmp.create(8 * maxk, inch, size / 8 + (size % 8) / 4 + size % 4, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 4) + tmp.create(4 * maxk, inch, size / 4 + size % 4, 4u * 4, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 4u * 4, 4, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = size / 12; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 12; + + float* tmpptr = tmp.channel(i / 12); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x12 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0); + __m128i _r8 = __lsx_vld(img0 + 4 * 8, 0); + __m128i _r9 = __lsx_vld(img0 + 4 * 9, 0); + __m128i _ra = __lsx_vld(img0 + 4 * 10, 0); + __m128i _rb = __lsx_vld(img0 + 4 * 11, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r89r = __lsx_vilvl_w(_r9, _r8); + __m128i _r89l = __lsx_vilvh_w(_r9, _r8); + __m128i _rabr = __lsx_vilvl_w(_rb, _ra); + __m128i _rabl = __lsx_vilvh_w(_rb, _ra); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + __m128i _r89ab_0 = __lsx_vilvl_d(_rabr, _r89r); + __m128i _r89ab_1 = __lsx_vilvh_d(_rabr, _r89r); + __m128i _r89ab_2 = __lsx_vilvl_d(_rabl, _r89l); + __m128i _r89ab_3 = __lsx_vilvh_d(_rabl, _r89l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r89ab_0, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 4, 0); + __lsx_vst(_r89ab_1, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 7, 0); + __lsx_vst(_r89ab_2, tmpptr + 4 * 8, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 9, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 10, 0); + __lsx_vst(_r89ab_3, tmpptr + 4 * 11, 0); + + img0 += size * 4; + tmpptr += 48; + } + } + } + + remain_size_start += nn_size * 12; + nn_size = (size - remain_size_start) >> 3; + + 
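+        // [editor's note] leftover output columns are repacked in progressively
+        // narrower blocks (8, then 4, then 1 column); each block width matches one
+        // of the accumulator tilings used by the GEMM loops later in this file.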
#pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 8; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x8 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 2, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 4, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 7, 0); + + img0 += size * 4; + tmpptr += 32; + } + } + } + + remain_size_start += nn_size << 3; + nn_size = (size - remain_size_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 4; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r0123_1, tmpptr + 4, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 3, 0); + + img0 += size * 4; + tmpptr += 16; + } + } + } + + remain_size_start += nn_size << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(img0, 0); + __lsx_vst(_val, tmpptr, 0); + + img0 += size * 4; + tmpptr += 4; + } + } + } + } + + int 
nn_outch = outch / 4; + int remain_outch_start = nn_outch * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + float* outptr0 = top_blob.channel(p); + float* outptr1 = top_blob.channel(p + 1); + float* outptr2 = top_blob.channel(p + 2); + float* outptr3 = top_blob.channel(p + 3); + + const float zeros[4] = {0.f}; + const float* biasptr = bias ? bias + p : zeros; + + int i = 0; + for (; i + 11 < size; i += 12) + { + const float* tmpptr = tmp.channel(i / 12); + const float* kptr0 = kernel.channel(p / 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128i _bias = __lsx_vld(biasptr, 0); + __m128 _sum0 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum1 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum2 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum3 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum4 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum5 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum6 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum7 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum8 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum9 = (__m128)__lsx_vreplvei_w(_bias, 3); + __m128 _suma = (__m128)__lsx_vreplvei_w(_bias, 3); + __m128 _sumb = (__m128)__lsx_vreplvei_w(_bias, 3); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 48); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0); + __m128 _val2 = (__m128)__lsx_vld(tmpptr + 8, 0); + __m128i _w0123 = __lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val0, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val1, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val2, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val0, _sum3); + _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val1, _sum4); + _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val2, _sum5); + _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val0, _sum6); + _sum7 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val1, _sum7); + _sum8 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val2, _sum8); + _sum9 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val0, _sum9); + _suma = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val1, _suma); + _sumb = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val2, _sumb); + + tmpptr += 12; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 8, 0); + __lsx_vst(_sum3, outptr1, 0); + __lsx_vst(_sum4, outptr1 + 4, 0); + __lsx_vst(_sum5, outptr1 + 8, 0); + __lsx_vst(_sum6, outptr2, 0); + __lsx_vst(_sum7, outptr2 + 4, 0); + __lsx_vst(_sum8, outptr2 + 8, 0); + __lsx_vst(_sum9, outptr3, 0); + __lsx_vst(_suma, outptr3 + 4, 0); + __lsx_vst(_sumb, outptr3 + 8, 0); + + outptr0 += 12; + outptr1 += 12; + outptr2 += 12; + outptr3 += 12; + } + for (; i + 7 < size; i += 8) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + const float* kptr0 = kernel.channel(p / 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128i _bias = __lsx_vld(biasptr, 0); + __m128 _sum0 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum1 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum2 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum3 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum4 = (__m128)__lsx_vreplvei_w(_bias, 
2); + __m128 _sum5 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum6 = (__m128)__lsx_vreplvei_w(_bias, 3); + __m128 _sum7 = (__m128)__lsx_vreplvei_w(_bias, 3); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0); + __m128i _w0123 = __lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val0, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val1, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val0, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val1, _sum3); + _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val0, _sum4); + _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val1, _sum5); + _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val0, _sum6); + _sum7 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val1, _sum7); + + tmpptr += 8; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr1, 0); + __lsx_vst(_sum3, outptr1 + 4, 0); + __lsx_vst(_sum4, outptr2, 0); + __lsx_vst(_sum5, outptr2 + 4, 0); + __lsx_vst(_sum6, outptr3, 0); + __lsx_vst(_sum7, outptr3 + 4, 0); + + outptr0 += 8; + outptr1 += 8; + outptr2 += 8; + outptr3 += 8; + } + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + const float* kptr0 = kernel.channel(p / 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128i _bias = __lsx_vld(biasptr, 0); + __m128 _sum0 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum1 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum2 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum3 = (__m128)__lsx_vreplvei_w(_bias, 3); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128i _w0123 = __lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val0, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val0, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val0, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val0, _sum3); + + tmpptr += 4; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + __lsx_vst(_sum2, outptr2, 0); + __lsx_vst(_sum3, outptr3, 0); + + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4); + const float* kptr0 = kernel.channel(p / 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum = (__m128)__lsx_vld(biasptr, 0); + float* _sum_p = (float*)&_sum; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 4); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*tmpptr++); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + + kptr0 += 4; + } + + outptr0[0] = _sum_p[0]; + outptr1[0] = _sum_p[1]; + outptr2[0] = _sum_p[2]; + outptr3[0] = _sum_p[3]; + + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + float* outptr0 = top_blob.channel(p); + + const float bias0 = bias ? 
bias[p] : 0.f; + + int i = 0; + for (; i + 11 < size; i += 12) + { + const float* tmpptr = tmp.channel(i / 12); + const float* kptr0 = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(bias0); + __m128 _sum1 = __lsx_vreplfr2vr_s(bias0); + __m128 _sum2 = __lsx_vreplfr2vr_s(bias0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 48); + __builtin_prefetch(kptr0 + 4); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0); + __m128 _val2 = (__m128)__lsx_vld(tmpptr + 8, 0); + __m128 _w0 = __lsx_vreplfr2vr_s(*kptr0); + _sum0 = __lsx_vfmadd_s(_val0, _w0, _sum0); + _sum1 = __lsx_vfmadd_s(_val1, _w0, _sum1); + _sum2 = __lsx_vfmadd_s(_val2, _w0, _sum2); + + tmpptr += 12; + kptr0 += 1; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 8, 0); + + outptr0 += 12; + } + for (; i + 7 < size; i += 8) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + const float* kptr0 = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(bias0); + __m128 _sum1 = __lsx_vreplfr2vr_s(bias0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr0 + 4); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0); + __m128 _w0 = __lsx_vreplfr2vr_s(*kptr0); + _sum0 = __lsx_vfmadd_s(_val0, _w0, _sum0); + _sum1 = __lsx_vfmadd_s(_val1, _w0, _sum1); + + tmpptr += 8; + kptr0 += 1; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + + outptr0 += 8; + } + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + const float* kptr0 = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(bias0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr0 + 4); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _w0 = __lsx_vreplfr2vr_s(*kptr0); + _sum0 = __lsx_vfmadd_s(_val0, _w0, _sum0); + + tmpptr += 4; + kptr0 += 1; + } + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4); + const float* kptr0 = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk; // inch always > 0 + + float sum0 = bias0; + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, _val0, _sum0); + tmpptr += 4; + kptr0 += 4; + } + + sum0 += __lsx_reduce_fadd_s(_sum0); + + outptr0[0] = sum0; + + outptr0 += 1; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = pb-pa-maxk-inch/pa-outch/pb + Mat kernel = _kernel.reshape(maxk, inch, outch); + kernel_tm.create(4 * 4 * maxk, inch / 4, outch / 4 + outch % 4); + + int q = 0; + for (; q + 3 < outch; q += 4) + { + float* g00 = kernel_tm.channel(q / 4); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; 
i++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = kernel.channel(q + j).row(p + i); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + for (; q < outch; q++) + { + const Mat k0 = kernel.channel(q); + + float* g00 = kernel_tm.channel(q / 4 + q % 4); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = k0.row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } +} + +static void convolution_im2col_sgemm_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator); + { + const int gap = (w * stride_h - outw * stride_w) * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + float* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const float* sptr = img.row(dilation_h * u) + dilation_w * v * 4; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __lsx_vst(_val, ptr, 0); + + sptr += stride_w * 4; + ptr += 4; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack4to1_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack8to1_int8.h b/src/layer/loongarch/convolution_sgemm_pack8to1_int8.h new file mode 100644 index 000000000000..98d11a574b0e --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack8to1_int8.h @@ -0,0 +1,458 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
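[editor's note] The int8 kernels in this file and the next widen their 8-bit operands before multiplying: __lsx_vslti_b(x, 0) builds the sign mask that __lsx_vilvl_b/__lsx_vilvh_b interleave in (sign extension to 16 bits), __lsx_vmul_h multiplies the 16-bit lanes, and __lsx_vhaddw_w_h folds adjacent products into 32-bit accumulators. A scalar sketch of one such step, with hypothetical names and no intrinsics, not part of the patch:

#include <cstdint>

// one widen-multiply-accumulate step over eight int8 pairs: each product has
// magnitude at most 128 * 128 = 16384, so it fits in int16; adjacent pairs are
// then summed into four int32 lanes, mirroring __lsx_vmul_h + __lsx_vhaddw_w_h.
static void int8_dot_step_ref(const int8_t val[8], const int8_t w[8], int32_t acc[4])
{
    for (int i = 0; i < 4; i++)
    {
        int16_t p0 = (int16_t)((int16_t)val[2 * i] * (int16_t)w[2 * i]);
        int16_t p1 = (int16_t)((int16_t)val[2 * i + 1] * (int16_t)w[2 * i + 1]);
        acc[i] += (int32_t)p0 + (int32_t)p1;
    }
}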
+ +static void im2col_sgemm_pack8to1_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + int64_t* tmpptr = tmp.channel(i / 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + __m128i _v = __lsx_vld(img0, 0); + __lsx_vst(_v, tmpptr, 0); + tmpptr += 2; + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + int64_t* tmpptr = tmp.channel(i / 2 + i % 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr += 1; + img0 += size; + } + } + } + } + + int nn_outch = 0; + int remain_outch_start = 0; + + nn_outch = outch >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + int* outptr2 = top_blob.channel(p + 2); + int* outptr3 = top_blob.channel(p + 3); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 64); + __builtin_prefetch(kptr + 128); + __m128i _val01 = __lsx_vld(tmpptr, 0); + __m128i _extval01 = __lsx_vslti_b(_val01, 0); + __m128i _val0 = __lsx_vilvl_b(_extval01, _val01); + __m128i _val1 = __lsx_vilvh_b(_extval01, _val01); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s00 = __lsx_vmul_h(_val0, _w0); + __m128i _s01 = __lsx_vmul_h(_val0, _w1); + __m128i _s02 = __lsx_vmul_h(_val0, _w2); + __m128i _s03 = __lsx_vmul_h(_val0, _w3); + __m128i _s10 = __lsx_vmul_h(_val1, _w0); + __m128i _s11 = __lsx_vmul_h(_val1, _w1); + __m128i _s12 = __lsx_vmul_h(_val1, _w2); + __m128i _s13 = __lsx_vmul_h(_val1, _w3); + + _sum00 = __lsx_vadd_w(_sum00, __lsx_vhaddw_w_h(_s00, _s00)); + _sum01 = __lsx_vadd_w(_sum01, __lsx_vhaddw_w_h(_s01, _s01)); + _sum02 = __lsx_vadd_w(_sum02, 
__lsx_vhaddw_w_h(_s02, _s02)); + _sum03 = __lsx_vadd_w(_sum03, __lsx_vhaddw_w_h(_s03, _s03)); + _sum10 = __lsx_vadd_w(_sum10, __lsx_vhaddw_w_h(_s10, _s10)); + _sum11 = __lsx_vadd_w(_sum11, __lsx_vhaddw_w_h(_s11, _s11)); + _sum12 = __lsx_vadd_w(_sum12, __lsx_vhaddw_w_h(_s12, _s12)); + _sum13 = __lsx_vadd_w(_sum13, __lsx_vhaddw_w_h(_s13, _s13)); + + tmpptr += 16; + kptr += 32; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = __lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + + int sum[8]; + __lsx_vst(_sum00, sum, 0); + __lsx_vst(_sum10, sum + 4, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0[1] = sum[4]; + outptr1[1] = sum[5]; + outptr2[1] = sum[6]; + outptr3[1] = sum[7]; + outptr0 += 2; + outptr1 += 2; + outptr2 += 2; + outptr3 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr + 128); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + __m128i _s2 = __lsx_vmul_h(_val16, _w2); + __m128i _s3 = __lsx_vmul_h(_val16, _w3); + + _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0)); + _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1)); + _sum2 = __lsx_vadd_w(_sum2, __lsx_vhaddw_w_h(_s2, _s2)); + _sum3 = __lsx_vadd_w(_sum3, __lsx_vhaddw_w_h(_s3, _s3)); + + tmpptr += 8; + kptr += 32; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + _sum0 = 
__lsx_vadd_w(_sum0, _sum2); + + int sum[4]; + __lsx_vst(_sum0, sum, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + + remain_outch_start += nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 64); + __builtin_prefetch(kptr + 32); + __m128i _val01 = __lsx_vld(tmpptr, 0); + __m128i _extval01 = __lsx_vslti_b(_val01, 0); + __m128i _val0 = __lsx_vilvl_b(_extval01, _val01); + __m128i _val1 = __lsx_vilvh_b(_extval01, _val01); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val0, _w16); + __m128i _s1 = __lsx_vmul_h(_val1, _w16); + + _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0)); + _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1)); + + tmpptr += 16; + kptr += 8; + } + + outptr0[0] = __lsx_reduce_add_w(_sum0); + outptr0[1] = __lsx_reduce_add_w(_sum1); + outptr0 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr + 32); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + + _sum = __lsx_vadd_w(_sum, __lsx_vhaddw_w_h(_s0, _s0)); + + tmpptr += 8; + kptr += 8; + } + + outptr0[0] = __lsx_reduce_add_w(_sum); + outptr0 += 1; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 8a-4b-maxk-inch/8a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); + if (outch >= 4) + kernel_tm.create(32 * maxk, inch / 8, outch / 4 + outch % 4, (size_t)1u); + else + kernel_tm.create(8 * maxk, inch / 8, outch, (size_t)1u); + + int q = 0; + for (; q + 3 < outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 8; j++) + { + const signed char* k00 = kernel.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + // TODO unroll 2 + for (; q < outch; q++) + { + signed char* g00 = kernel_tm.channel(q / 4 + q % 4); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + for (int j = 0; j < 8; j++) + { + const signed char* k00 = kernel.channel(q).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } +} + +static void convolution_im2col_sgemm_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int 
dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + int64_t* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const int64_t* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack8to1_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack8to4_int8.h b/src/layer/loongarch/convolution_sgemm_pack8to4_int8.h new file mode 100644 index 000000000000..ae9090c95606 --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack8to4_int8.h @@ -0,0 +1,324 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
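[editor's note] Both int8 sgemm files finish a tile the same way: each of the four 32-bit accumulators holds four partial sums belonging to one output channel, so the code transposes the 4x4 block with __lsx_vilvl_w/__lsx_vilvh_w/__lsx_vilvl_d/__lsx_vilvh_d and then adds the rows, turning four horizontal reductions into three vector adds. A scalar equivalent with hypothetical names, for illustration only:

#include <cstdint>

// acc[r][*] holds four partial sums of output channel r; out[r] is their total.
// Transposing first lets the per-channel totals be produced by plain element-wise
// adds, which is what the vilv* + __lsx_vadd_w sequences compute.
static void reduce_4x4_accumulators_ref(const int32_t acc[4][4], int32_t out[4])
{
    int32_t t[4][4];
    for (int r = 0; r < 4; r++)
        for (int c = 0; c < 4; c++)
            t[c][r] = acc[r][c]; // transpose

    for (int r = 0; r < 4; r++)
        out[r] = t[0][r] + t[1][r] + t[2][r] + t[3][r]; // row-wise adds = channel totals
}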
+ +static void im2col_sgemm_pack8to4_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = size >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + int64_t* tmpptr = tmp.channel(i / 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + __m128i _v = __lsx_vld(img0, 0); + __lsx_vst(_v, tmpptr, 0); + tmpptr += 2; + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + int64_t* tmpptr = tmp.channel(i / 2 + i % 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr += 1; + img0 += size; + } + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 64); + __builtin_prefetch(kptr + 128); + __m128i _val01 = __lsx_vld(tmpptr, 0); + __m128i _extval01 = __lsx_vslti_b(_val01, 0); + __m128i _val0 = __lsx_vilvl_b(_extval01, _val01); + __m128i _val1 = __lsx_vilvh_b(_extval01, _val01); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s00 = __lsx_vmul_h(_val0, _w0); + __m128i _s01 = __lsx_vmul_h(_val0, _w1); + __m128i _s02 = __lsx_vmul_h(_val0, _w2); + __m128i _s03 = __lsx_vmul_h(_val0, _w3); + __m128i _s10 = __lsx_vmul_h(_val1, _w0); + __m128i _s11 = __lsx_vmul_h(_val1, _w1); + __m128i _s12 = __lsx_vmul_h(_val1, _w2); + __m128i _s13 = __lsx_vmul_h(_val1, _w3); + + _sum00 = __lsx_vadd_w(_sum00, __lsx_vhaddw_w_h(_s00, _s00)); + _sum01 = __lsx_vadd_w(_sum01, __lsx_vhaddw_w_h(_s01, _s01)); + _sum02 = __lsx_vadd_w(_sum02, __lsx_vhaddw_w_h(_s02, _s02)); + _sum03 = __lsx_vadd_w(_sum03, __lsx_vhaddw_w_h(_s03, _s03)); + _sum10 = __lsx_vadd_w(_sum10, __lsx_vhaddw_w_h(_s10, _s10)); + _sum11 = __lsx_vadd_w(_sum11, __lsx_vhaddw_w_h(_s11, _s11)); + _sum12 = __lsx_vadd_w(_sum12, __lsx_vhaddw_w_h(_s12, 
_s12)); + _sum13 = __lsx_vadd_w(_sum13, __lsx_vhaddw_w_h(_s13, _s13)); + + tmpptr += 16; + kptr += 32; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = __lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum10, outptr0 + 4, 0); + outptr0 += 8; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr + 128); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + __m128i _s2 = __lsx_vmul_h(_val16, _w2); + __m128i _s3 = __lsx_vmul_h(_val16, _w3); + + _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0)); + _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1)); + _sum2 = __lsx_vadd_w(_sum2, __lsx_vhaddw_w_h(_s2, _s2)); + _sum3 = __lsx_vadd_w(_sum3, __lsx_vhaddw_w_h(_s3, _s3)); + + tmpptr += 8; + kptr += 32; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + _sum0 = __lsx_vadd_w(_sum0, _sum2); + + __lsx_vst(_sum0, outptr0, 0); + outptr0 += 4; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 8a-4b-maxk-inch/8a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); + kernel_tm.create(32 * maxk, inch / 8, outch / 4, (size_t)1u); + + for (int q = 0; q + 3 < 
outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 8; j++) + { + const signed char* k00 = kernel.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } +} + +static void convolution_im2col_sgemm_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + int64_t* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const int64_t* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack8to4_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_winograd_dot.h b/src/layer/loongarch/convolution_winograd_dot.h new file mode 100644 index 000000000000..9dbbe4955490 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot.h @@ -0,0 +1,495 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
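[editor's note] convolution_winograd_dot_lsx below is the middle stage of the Winograd pipeline: after the input and kernel transforms, each of the 16/36/64 transform points is an independent small matrix product of the transformed kernel (outch x inch) with the transformed input (inch x tiles). The scalar sketch below states that contraction with a hypothetical flat layout and is not part of the patch; the real code additionally repacks tiles in groups of four and tiles the output channels 8/4/1 (or 2/1 without LSX):

#include <cstddef>

// bottom_tm: [inch][batch][tiles], kernel_tm: [outch][batch][inch],
// top_tm: [outch][batch][tiles]; batch is the number of transform points.
static void winograd_dot_ref(const float* bottom_tm, const float* kernel_tm,
                             float* top_tm, int inch, int outch, int batch, int tiles)
{
    for (int p = 0; p < outch; p++)
        for (int b = 0; b < batch; b++)
            for (int t = 0; t < tiles; t++)
            {
                float sum = 0.f;
                for (int q = 0; q < inch; q++)
                    sum += bottom_tm[((size_t)q * batch + b) * tiles + t]
                           * kernel_tm[((size_t)p * batch + b) * inch + q];
                top_tm[((size_t)p * batch + b) * tiles + t] = sum;
            }
}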
+ +static void convolution_winograd_dot_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 4u, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; + if (tiles >= 4) + bottom_blob_tm2.create(4 * inch, tiles / 4 + tiles % 4, batch, 4u, opt.workspace_allocator); + else + bottom_blob_tm2.create(1 * inch, tiles, batch, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 3 < tiles; i += 4) + { + float* tmpptr = tm2.row(i / 4); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i); + + for (int q = 0; q < inch; q++) + { +#if __loongarch_sx + __lsx_vst(__lsx_vld(r0, 0), tmpptr, 0); +#else + tmpptr[0] = r0[0]; + tmpptr[1] = r0[1]; + tmpptr[2] = r0[2]; + tmpptr[3] = r0[3]; +#endif + + r0 += bottom_blob_tm.cstep; + tmpptr += 4; + } + } + for (; i < tiles; i++) + { + float* tmpptr = tm2.row(i / 4 + i % 4); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i); + + for (int q = 0; q < inch; q++) + { + tmpptr[0] = r0[0]; + + r0 += bottom_blob_tm.cstep; + tmpptr += 1; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 4u, opt.workspace_allocator); + +#if __loongarch_sx + int nn_outch = outch >> 3; + int remain_outch_start = nn_outch << 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 8; + + float* output0_tm = top_blob_tm.channel(p); + float* output1_tm = top_blob_tm.channel(p + 1); + float* output2_tm = top_blob_tm.channel(p + 2); + float* output3_tm = top_blob_tm.channel(p + 3); + float* output4_tm = top_blob_tm.channel(p + 4); + float* output5_tm = top_blob_tm.channel(p + 5); + float* output6_tm = top_blob_tm.channel(p + 6); + float* output7_tm = top_blob_tm.channel(p + 7); + + const Mat kernel0_tm = kernel_tm.channel(p / 8); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum4 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum5 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum6 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum7 = (__m128)__lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 32); + __m128 _val = (__m128)__lsx_vld(r0, 0); + __m128i _w0123 = __lsx_vld(k0, 0); + __m128i _w4567 = __lsx_vld(k0 + 4, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3); + _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 0), _val, _sum4); + _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 1), _val, _sum5); + _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 2), _val, _sum6); + _sum7 
= __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 3), _val, _sum7); + + r0 += 4; + k0 += 8; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output1_tm, 0); + __lsx_vst(_sum2, output2_tm, 0); + __lsx_vst(_sum3, output3_tm, 0); + __lsx_vst(_sum4, output4_tm, 0); + __lsx_vst(_sum5, output5_tm, 0); + __lsx_vst(_sum6, output6_tm, 0); + __lsx_vst(_sum7, output7_tm, 0); + + output0_tm += 4; + output1_tm += 4; + output2_tm += 4; + output3_tm += 4; + output4_tm += 4; + output5_tm += 4; + output6_tm += 4; + output7_tm += 4; + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 4 + i % 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + float sum4 = 0.f; + float sum5 = 0.f; + float sum6 = 0.f; + float sum7 = 0.f; + + int j = 0; + for (; j < nn; j++) + { + sum0 += r0[0] * k0[0]; + sum1 += r0[0] * k0[1]; + sum2 += r0[0] * k0[2]; + sum3 += r0[0] * k0[3]; + sum4 += r0[0] * k0[4]; + sum5 += r0[0] * k0[5]; + sum6 += r0[0] * k0[6]; + sum7 += r0[0] * k0[7]; + + r0 += 1; + k0 += 8; + } + + output0_tm[0] = sum0; + output1_tm[0] = sum1; + output2_tm[0] = sum2; + output3_tm[0] = sum3; + output4_tm[0] = sum4; + output5_tm[0] = sum5; + output6_tm[0] = sum6; + output7_tm[0] = sum7; + + output0_tm++; + output1_tm++; + output2_tm++; + output3_tm++; + output4_tm++; + output5_tm++; + output6_tm++; + output7_tm++; + } + } + } + + nn_outch = (outch - remain_outch_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = remain_outch_start + pp * 4; + + float* output0_tm = top_blob_tm.channel(p); + float* output1_tm = top_blob_tm.channel(p + 1); + float* output2_tm = top_blob_tm.channel(p + 2); + float* output3_tm = top_blob_tm.channel(p + 3); + + const Mat kernel0_tm = kernel_tm.channel(p / 8 + (p % 8) / 4); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 16); + __m128 _val = (__m128)__lsx_vld(r0, 0); + __m128i _w0123 = __lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3); + + r0 += 4; + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output1_tm, 0); + __lsx_vst(_sum2, output2_tm, 0); + __lsx_vst(_sum3, output3_tm, 0); + + output0_tm += 4; + output1_tm += 4; + output2_tm += 4; + output3_tm += 4; + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 4 + i % 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + + int j = 0; + for (; j < nn; j++) + { + sum0 += r0[0] * k0[0]; + sum1 += r0[0] * k0[1]; + sum2 += r0[0] * k0[2]; + sum3 += r0[0] * k0[3]; + + r0 += 1; + k0 += 4; + } + + output0_tm[0] = sum0; + output1_tm[0] = sum1; + 
output2_tm[0] = sum2; + output3_tm[0] = sum3; + + output0_tm++; + output1_tm++; + output2_tm++; + output3_tm++; + } + } + } + + remain_outch_start += nn_outch << 2; +#else + int nn_outch = outch >> 1; + int remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + float* output0_tm = top_blob_tm.channel(p); + float* output1_tm = top_blob_tm.channel(p + 1); + + const Mat kernel0_tm = kernel_tm.channel(p / 2); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum00 = 0.f; + float sum01 = 0.f; + float sum02 = 0.f; + float sum03 = 0.f; + float sum10 = 0.f; + float sum11 = 0.f; + float sum12 = 0.f; + float sum13 = 0.f; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 8); + float w0 = k0[0]; + float w1 = k0[1]; + sum00 += r0[0] * w0; + sum01 += r0[1] * w0; + sum02 += r0[2] * w0; + sum03 += r0[3] * w0; + sum10 += r0[0] * w1; + sum11 += r0[1] * w1; + sum12 += r0[2] * w1; + sum13 += r0[3] * w1; + + r0 += 4; + k0 += 2; + } + + output0_tm[0] = sum00; + output0_tm[1] = sum01; + output0_tm[2] = sum02; + output0_tm[3] = sum03; + output1_tm[0] = sum10; + output1_tm[1] = sum11; + output1_tm[2] = sum12; + output1_tm[3] = sum13; + + output0_tm += 4; + output1_tm += 4; + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 4 + i % 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum00 = 0.f; + float sum10 = 0.f; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 4); + __builtin_prefetch(k0 + 8); + float val0 = r0[0]; + sum00 += val0 * k0[0]; + sum10 += val0 * k0[1]; + + r0 += 1; + k0 += 2; + } + + output0_tm[0] = sum00; + output1_tm[0] = sum10; + output0_tm++; + output1_tm++; + } + } + } +#endif + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + float* output0_tm = top_blob_tm.channel(p); + +#if __loongarch_sx + const Mat kernel0_tm = kernel_tm.channel(p / 8 + (p % 8) / 4 + p % 4); +#else + const Mat kernel0_tm = kernel_tm.channel(p / 2 + p % 2); +#endif + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + int j = 0; +#if __loongarch_sx + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + + for (; j < nn; j++) + { + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vld(r0, 0), __lsx_vreplfr2vr_s(k0[0]), _sum0); + r0 += 4; + k0++; + } + + __lsx_vst(_sum0, output0_tm, 0); + output0_tm += 4; +#else // __loongarch_sx + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + + for (; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 4); + float w0 = k0[0]; + sum0 += r0[0] * w0; + sum1 += r0[1] * w0; + sum2 += r0[2] * w0; + sum3 += r0[3] * w0; + + r0 += 4; + k0++; + } + + output0_tm[0] = sum0; + output0_tm[1] = sum1; + output0_tm[2] = sum2; + output0_tm[3] = sum3; + output0_tm += 4; +#endif // __loongarch_sx + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 4 + i % 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum = 0.f; + + for (int j = 0; j < nn; 
j++) + { + float w0 = k0[0]; + float val0 = r0[0]; + sum += val0 * w0; + + r0 += 1; + k0 += 1; + } + + output0_tm[0] = sum; + output0_tm += 1; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_dot_int8.h b/src/layer/loongarch/convolution_winograd_dot_int8.h new file mode 100644 index 000000000000..2ae5ce4f55eb --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot_int8.h @@ -0,0 +1,594 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_winograd_dot_int8_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 2u, 1, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; +#if __loongarch_sx + if (inch >= 4) + { + if (tiles >= 2) + bottom_blob_tm2.create(inch / 4 + inch % 4, tiles / 2 + tiles % 2, batch, 16u, 8, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(inch / 4 + inch % 4, tiles, batch, 8u, 4, opt.workspace_allocator); + } + else +#endif // __loongarch_sx + { + if (tiles >= 2) + bottom_blob_tm2.create(inch, tiles / 2 + tiles % 2, batch, 4u, 2, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(inch, tiles, batch, 2u, 1, opt.workspace_allocator); + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 1 < tiles; i += 2) + { + short* tmpptr = tm2.row(i / 2); + + const short* r0 = (const short*)bottom_blob_tm + r * tiles + i; + + int q = 0; +#if __loongarch_sx + const short* r1 = (const short*)bottom_blob_tm.channel(1) + r * tiles + i; + const short* r2 = (const short*)bottom_blob_tm.channel(2) + r * tiles + i; + const short* r3 = (const short*)bottom_blob_tm.channel(3) + r * tiles + i; + for (; q + 3 < inch; q += 4) + { + tmpptr[0] = r0[0]; + tmpptr[1] = r1[0]; + tmpptr[2] = r2[0]; + tmpptr[3] = r3[0]; + tmpptr[4] = r0[1]; + tmpptr[5] = r1[1]; + tmpptr[6] = r2[1]; + tmpptr[7] = r3[1]; + r0 += bottom_blob_tm.cstep * 4; + r1 += bottom_blob_tm.cstep * 4; + r2 += bottom_blob_tm.cstep * 4; + r3 += bottom_blob_tm.cstep * 4; + tmpptr += 8; + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + tmpptr[0] = r0[0]; + tmpptr[1] = r0[1]; + r0 += bottom_blob_tm.cstep; + tmpptr += 2; + } + } + for (; i < tiles; i++) + { + short* tmpptr = tm2.row(i / 2 + i % 2); + + const short* r0 = (const short*)bottom_blob_tm + r * tiles + i; + + int q = 0; +#if __loongarch_sx + const short* r1 = (const short*)bottom_blob_tm.channel(1) + r * tiles + i; + const short* r2 = (const short*)bottom_blob_tm.channel(2) + r * tiles + i; + const short* r3 = (const short*)bottom_blob_tm.channel(3) + r * tiles + i; + for (; q + 3 < inch; q += 4) + { + 
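+                    // gather one transformed coefficient from each of the 4 consecutive
+                    // input channels, so the int8 dot loop below can fetch all four of
+                    // them with a single __lsx_vld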
tmpptr[0] = r0[0]; + tmpptr[1] = r1[0]; + tmpptr[2] = r2[0]; + tmpptr[3] = r3[0]; + r0 += bottom_blob_tm.cstep * 4; + r1 += bottom_blob_tm.cstep * 4; + r2 += bottom_blob_tm.cstep * 4; + r3 += bottom_blob_tm.cstep * 4; + tmpptr += 4; + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + tmpptr[0] = r0[0]; + r0 += bottom_blob_tm.cstep; + tmpptr += 1; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 4u, 1, opt.workspace_allocator); + +#if __loongarch_sx + int nn_outch = outch >> 2; + int remain_outch_start = nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* output0_tm = top_blob_tm.channel(p); + int* output1_tm = top_blob_tm.channel(p + 1); + int* output2_tm = top_blob_tm.channel(p + 2); + int* output3_tm = top_blob_tm.channel(p + 3); + + const Mat kernel0_tm = kernel_tm.channel(p / 4); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int nn4 = inch / 4; + int nn1 = inch % 4; + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val01 = __lsx_vld(r0, 0); + + __m128i _val0 = __lsx_vilvl_d(_val01, _val01); + __m128i _val1 = __lsx_vilvh_d(_val01, _val01); + + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + + __m128i _extval0 = __lsx_vslti_h(_val0, 0); + __m128i _extval1 = __lsx_vslti_h(_val1, 0); + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + + __m128i _val0l = __lsx_vilvl_h(_extval0, _val0); + __m128i _val0h = __lsx_vilvh_h(_extval0, _val0); + __m128i _val1l = __lsx_vilvl_h(_extval1, _val1); + __m128i _val1h = __lsx_vilvh_h(_extval1, _val1); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + + _sum00 = __lsx_vmadd_w(_sum00, _val0l, _w0l); + _sum01 = __lsx_vmadd_w(_sum01, _val0h, _w0h); + _sum02 = __lsx_vmadd_w(_sum02, _val0l, _w1l); + _sum03 = __lsx_vmadd_w(_sum03, _val0h, _w1h); + _sum10 = __lsx_vmadd_w(_sum10, _val1l, _w0l); + _sum11 = __lsx_vmadd_w(_sum11, _val1h, _w0h); + _sum12 = __lsx_vmadd_w(_sum12, _val1l, _w1l); + _sum13 = __lsx_vmadd_w(_sum13, _val1h, _w1h); + + r0 += 8; + k0 += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = __lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); 
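+                    // regrouped this way, the __lsx_vadd_w folds below leave each lane of
+                    // _sum00 / _sum10 holding the complete int32 dot product for one of the
+                    // four output channels (for tile i and tile i+1 respectively)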
+ } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + } + + for (int j = 0; j < nn1; j++) + { + __m128i _val0 = __lsx_vreplgr2vr_h(r0[0]); + __m128i _val1 = __lsx_vreplgr2vr_h(r0[1]); + __m128i _val = __lsx_vilvl_d(_val1, _val0); + + __m128i _w16 = __lsx_vld(k0, 0); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _extval = __lsx_vslti_h(_val, 0); + __m128i _extw16 = __lsx_vslti_h(_w16, 0); + + __m128i _vall = __lsx_vilvl_h(_extval, _val); + __m128i _valh = __lsx_vilvh_h(_extval, _val); + __m128i _w0l = __lsx_vilvl_h(_extw16, _w16); + __m128i _w0h = __lsx_vilvh_h(_extw16, _w16); + + _sum00 = __lsx_vmadd_w(_sum00, _vall, _w0l); + _sum10 = __lsx_vmadd_w(_sum10, _valh, _w0h); + + r0 += 2; + k0 += 4; + } + + int sum[8]; + __lsx_vst(_sum00, sum, 0); + __lsx_vst(_sum10, sum + 4, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; + output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm[1] = sum[4]; + output1_tm[1] = sum[5]; + output2_tm[1] = sum[6]; + output3_tm[1] = sum[7]; + output0_tm += 2; + output1_tm += 2; + output2_tm += 2; + output3_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int nn4 = inch / 4; + int nn1 = inch % 4; + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val16 = __lsx_vld(r0, 0); + + _val16 = __lsx_vilvl_d(_val16, _val16); + + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + + __m128i _extval16 = __lsx_vslti_h(_val16, 0); + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + + __m128i _val0l = __lsx_vilvl_h(_extval16, _val16); + __m128i _val0h = __lsx_vilvh_h(_extval16, _val16); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + + _sum0 = __lsx_vmadd_w(_sum0, _val0l, _w0l); + _sum1 = __lsx_vmadd_w(_sum1, _val0h, _w0h); + _sum2 = __lsx_vmadd_w(_sum2, _val0l, _w1l); + _sum3 = __lsx_vmadd_w(_sum3, _val0h, _w1h); + + r0 += 4; + k0 += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + _sum0 = __lsx_vadd_w(_sum0, _sum2); + } + + for (int j = 0; j < nn1; j++) + { + __m128i _val = __lsx_vreplgr2vr_w(r0[0]); + __m128i _w16 = __lsx_vld(k0, 0); + + __m128i _extw16 = __lsx_vslti_h(_w16, 0); + __m128i _w0l = __lsx_vilvl_h(_extw16, _w16); + + _sum0 = __lsx_vmadd_w(_sum0, _val, _w0l); + + r0 += 1; + k0 += 4; + } + + int sum[4]; + __lsx_vst(_sum0, sum, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; + output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm += 1; + output1_tm += 1; + output2_tm += 1; + output3_tm += 1; + } + } + } +#else // __loongarch_sx + int nn_outch = outch >> 1; + int 
remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + int* output0_tm = top_blob_tm.channel(p); + int* output1_tm = top_blob_tm.channel(p + 1); + + const Mat kernel0_tm = kernel_tm.channel(p / 2); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int sum00 = 0; + int sum01 = 0; + int sum10 = 0; + int sum11 = 0; + + int nn1 = inch; + + for (int j = 0; j < nn1; j++) + { + signed short val0 = r0[0]; + signed short val1 = r0[1]; + signed short w0 = k0[0]; + signed short w1 = k0[1]; + + sum00 += val0 * w0; + sum01 += val1 * w0; + sum10 += val0 * w1; + sum11 += val1 * w1; + + r0 += 2; + k0 += 2; + } + + output0_tm[0] = sum00; + output0_tm[1] = sum01; + output1_tm[0] = sum10; + output1_tm[1] = sum11; + output0_tm += 2; + output1_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int sum0 = 0; + int sum1 = 0; + + int nn1 = inch; + + for (int j = 0; j < nn1; j++) + { + signed short val0 = r0[0]; + signed short w0 = k0[0]; + signed short w1 = k0[1]; + + sum0 += val0 * w0; + sum1 += val0 * w1; + + r0 += 1; + k0 += 2; + } + + output0_tm[0] = sum0; + output1_tm[0] = sum1; + output0_tm += 1; + output1_tm += 1; + } + } + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* output0_tm = top_blob_tm.channel(p); + +#if __loongarch_sx + const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4); +#else + const Mat kernel0_tm = kernel_tm.channel(p / 2 + p % 2); +#endif + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int sum0 = 0; + int sum1 = 0; + +#if __loongarch_sx + int nn4 = inch / 4; + int nn1 = inch % 4; + + if (nn4 > 0) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val16 = __lsx_vld(r0, 0); + + __m128i _w16 = __lsx_vld(k0, 0); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _extval16 = __lsx_vslti_h(_val16, 0); + __m128i _extw16 = __lsx_vslti_h(_w16, 0); + + __m128i _val0l = __lsx_vilvl_h(_extval16, _val16); + __m128i _val0h = __lsx_vilvh_h(_extval16, _val16); + + __m128i _w0l = __lsx_vilvl_h(_extw16, _w16); + __m128i _w0h = __lsx_vilvh_h(_extw16, _w16); + + _sum0 = __lsx_vmadd_w(_sum0, _val0l, _w0l); + _sum1 = __lsx_vmadd_w(_sum1, _val0h, _w0h); + + r0 += 8; + k0 += 4; + } + + sum0 = __lsx_reduce_add_w(_sum0); + sum1 = __lsx_reduce_add_w(_sum1); + } +#else // __loongarch_sx + int nn1 = inch; +#endif // __loongarch_sx + + for (int q = 0; q < nn1; q++) + { + signed short val0 = r0[0]; + signed short val1 = r0[1]; + signed short w = k0[0]; + + sum0 += val0 * w; + sum1 += val1 * w; + + k0 += 1; + r0 += 2; + } + + output0_tm[0] = sum0; + output0_tm[1] = sum1; + output0_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int sum = 0; + +#if __loongarch_sx + int nn4 = inch / 4; + int nn1 = inch % 4; + + if (nn4 > 0) + { + __m128i _sum = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val16 = __lsx_vld(r0, 0); + 
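+                        // _val16 holds the 4-channel int16 coefficients of both tiles
+                        // (tile i in the low half, tile i+1 in the high half); the
+                        // vslti_h / vilvl_h pairs below sign-extend values and weights
+                        // to int32 before the multiply-accumulate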
__m128i _w16 = __lsx_vld(k0, 0); + + __m128i _extval16 = __lsx_vslti_h(_val16, 0); + __m128i _extw16 = __lsx_vslti_h(_w16, 0); + + __m128i _val0l = __lsx_vilvl_h(_extval16, _val16); + __m128i _w0l = __lsx_vilvl_h(_extw16, _w16); + + _sum = __lsx_vmadd_w(_sum, _val0l, _w0l); + + r0 += 4; + k0 += 4; + } + + sum = __lsx_reduce_add_w(_sum); + } +#else // __loongarch_sx + int nn1 = inch; +#endif // __loongarch_sx + + for (int q = 0; q < nn1; q++) + { + signed short val = r0[0]; + signed short w = k0[0]; + + sum += val * w; + + k0 += 1; + r0 += 1; + } + + output0_tm[0] = sum; + output0_tm++; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_dot_pack4.h b/src/layer/loongarch/convolution_winograd_dot_pack4.h new file mode 100644 index 000000000000..66002a62a625 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot_pack4.h @@ -0,0 +1,448 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_winograd_dot_pack4_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 16u, 4, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; + if (tiles >= 12) + bottom_blob_tm2.create(12 * inch, tiles / 12 + (tiles % 12) / 8 + (tiles % 12 % 8) / 4 + (tiles % 12 % 4) / 2 + tiles % 12 % 2, batch, 16u, 4, opt.workspace_allocator); + else if (tiles >= 8) + bottom_blob_tm2.create(8 * inch, tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2, batch, 16u, 4, opt.workspace_allocator); + else if (tiles >= 4) + bottom_blob_tm2.create(4 * inch, tiles / 4 + (tiles % 4) / 2 + tiles % 2, batch, 16u, 4, opt.workspace_allocator); + else if (tiles >= 2) + bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, batch, 16u, 4, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(1 * inch, tiles, batch, 16u, 4, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 11 < tiles; i += 12) + { + float* tmpptr = tm2.row(i / 12); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + // transpose 4x8 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(r0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(r0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(r0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(r0 + 4 * 7, 0); + __m128i _r8 = __lsx_vld(r0 + 4 * 8, 0); + __m128i _r9 = __lsx_vld(r0 + 4 * 9, 0); + __m128i _ra = __lsx_vld(r0 + 4 * 10, 0); + __m128i _rb = __lsx_vld(r0 + 4 * 11, 0); + + __m128i _r01r = 
__lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r89r = __lsx_vilvl_w(_r9, _r8); + __m128i _r89l = __lsx_vilvh_w(_r9, _r8); + __m128i _rabr = __lsx_vilvl_w(_rb, _ra); + __m128i _rabl = __lsx_vilvh_w(_rb, _ra); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + __m128i _r89ab_0 = __lsx_vilvl_d(_rabr, _r89r); + __m128i _r89ab_1 = __lsx_vilvh_d(_rabr, _r89r); + __m128i _r89ab_2 = __lsx_vilvl_d(_rabl, _r89l); + __m128i _r89ab_3 = __lsx_vilvh_d(_rabl, _r89l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r89ab_0, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 4, 0); + __lsx_vst(_r89ab_1, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 7, 0); + __lsx_vst(_r89ab_2, tmpptr + 4 * 8, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 9, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 10, 0); + __lsx_vst(_r89ab_3, tmpptr + 4 * 11, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 48; + } + } + for (; i + 7 < tiles; i += 8) + { + float* tmpptr = tm2.row(i / 12 + (i % 12) / 8); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + // transpose 4x8 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(r0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(r0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(r0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(r0 + 4 * 7, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 2, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 4, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 7, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 32; + } + } + for (; i + 3 < tiles; i += 4) + { + float* tmpptr = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + // transpose 4x4 
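+                    // gather the same pack-4 lane of the 4 adjacent tiles into one vector;
+                    // after this permute the dot loop further down reads one group of
+                    // 4 tile values per __lsx_vld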
+ __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r0123_1, tmpptr + 4, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 3, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 16; + } + } + for (; i + 1 < tiles; i += 2) + { + float* tmpptr = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + // transpose 4x2 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + + __m128i _r01_0 = __lsx_vilvl_w(_r1, _r0); + __m128i _r01_1 = __lsx_vilvh_w(_r1, _r0); + + __lsx_vst(_r01_0, tmpptr, 0); + __lsx_vst(_r01_1, tmpptr + 4, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 8; + } + } + for (; i < tiles; i++) + { + float* tmpptr = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + __m128i _val = __lsx_vld(r0, 0); + __lsx_vst(_val, tmpptr, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 4; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 16u, 4, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* output0_tm = top_blob_tm.channel(p); + + const Mat kernel0_tm = kernel_tm.channel(p); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 11 < tiles; i += 12) + { + const float* r0 = bb2.row(i / 12); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum4 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum5 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum6 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum7 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum8 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum9 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _suma = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sumb = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 48); + __builtin_prefetch(k0 + 16); + __m128i _val0123 = __lsx_vld(r0, 0); + __m128i _val4567 = __lsx_vld(r0 + 4, 0); + __m128i _val89ab = __lsx_vld(r0 + 8, 0); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4); + _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5); + _sum6 = 
__lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6); + _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7); + _sum8 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 0), _sum8); + _sum9 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 1), _sum9); + _suma = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 2), _suma); + _sumb = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 3), _sumb); + + r0 += 12; + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output0_tm + 4, 0); + __lsx_vst(_sum2, output0_tm + 4 * 2, 0); + __lsx_vst(_sum3, output0_tm + 4 * 3, 0); + __lsx_vst(_sum4, output0_tm + 4 * 4, 0); + __lsx_vst(_sum5, output0_tm + 4 * 5, 0); + __lsx_vst(_sum6, output0_tm + 4 * 6, 0); + __lsx_vst(_sum7, output0_tm + 4 * 7, 0); + __lsx_vst(_sum8, output0_tm + 4 * 8, 0); + __lsx_vst(_sum9, output0_tm + 4 * 9, 0); + __lsx_vst(_suma, output0_tm + 4 * 10, 0); + __lsx_vst(_sumb, output0_tm + 4 * 11, 0); + + output0_tm += 4 * 12; + } + for (; i + 7 < tiles; i += 8) + { + const float* r0 = bb2.row(i / 12 + (i % 12) / 8); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum4 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum5 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum6 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum7 = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 16); + __m128i _val0123 = __lsx_vld(r0, 0); + __m128i _val4567 = __lsx_vld(r0 + 4, 0); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4); + _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5); + _sum6 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6); + _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7); + + r0 += 8; + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output0_tm + 4, 0); + __lsx_vst(_sum2, output0_tm + 4 * 2, 0); + __lsx_vst(_sum3, output0_tm + 4 * 3, 0); + __lsx_vst(_sum4, output0_tm + 4 * 4, 0); + __lsx_vst(_sum5, output0_tm + 4 * 5, 0); + __lsx_vst(_sum6, output0_tm + 4 * 6, 0); + __lsx_vst(_sum7, output0_tm + 4 * 7, 0); + + output0_tm += 4 * 8; + } + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 16); + __m128i _val0123 = __lsx_vld(r0, 0); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, 
(__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + + r0 += 4; + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output0_tm + 4, 0); + __lsx_vst(_sum2, output0_tm + 4 * 2, 0); + __lsx_vst(_sum3, output0_tm + 4 * 3, 0); + + output0_tm += 4 * 4; + } + for (; i + 1 < tiles; i += 2) + { + const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 8); + __builtin_prefetch(k0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*r0++); + __m128 _val1 = __lsx_vreplfr2vr_s(*r0++); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s(_w0, _val0, _sum0); + _sum1 = __lsx_vfmadd_s(_w0, _val1, _sum1); + + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output0_tm + 4, 0); + + output0_tm += 4 * 2; + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 4); + __builtin_prefetch(k0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*r0++); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + + k0 += 4; + } + + __lsx_vst(_sum, output0_tm, 0); + + output0_tm += 4; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_dot_pack8to1_int8.h b/src/layer/loongarch/convolution_winograd_dot_pack8to1_int8.h new file mode 100644 index 000000000000..f87aa9ef558a --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot_pack8to1_int8.h @@ -0,0 +1,363 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
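+
+// Dot-product stage of the int8 Winograd path for pack8 input / pack1 output.
+// bottom_blob_tm holds the transformed input as int16 (elempack 8), kernel_tm the
+// transformed weights as int16, and every element of top_blob_tm is an int32 dot
+// product over inch * 8 terms.  As a rough scalar sketch of what the vectorized
+// loops below compute (the in()/w()/out() indexing is only notation for this note,
+// not real accessors):
+//
+//   for each output channel p, transform position r, tile i:
+//       int sum = 0;
+//       for (int q = 0; q < inch; q++)
+//           for (int k = 0; k < 8; k++)
+//               sum += (int)in(q, r, i, k) * (int)w(p, r, q, k);
+//       out(p, r, i) = sum;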
+ +static void convolution_winograd_dot_pack8to1_int8_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 16u, 8, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; + if (tiles >= 2) + bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, batch, 16u, 8, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(1 * inch, tiles, batch, 16u, 8, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 1 < tiles; i += 2) + { + short* tmpptr = tm2.row(i / 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 8, 0); + __lsx_vst(_r0, tmpptr, 0); + __lsx_vst(_r1, tmpptr + 8, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 16; + } + } + for (; i < tiles; i++) + { + short* tmpptr = tm2.row(i / 2 + i % 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + __m128i _r0 = __lsx_vld(r0, 0); + __lsx_vst(_r0, tmpptr, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 8; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 4u, 1, opt.workspace_allocator); + + int nn_outch = 0; + int remain_outch_start = 0; + + nn_outch = outch >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* output0_tm = top_blob_tm.channel(p); + int* output1_tm = top_blob_tm.channel(p + 1); + int* output2_tm = top_blob_tm.channel(p + 2); + int* output3_tm = top_blob_tm.channel(p + 3); + + const Mat kernel0_tm = kernel_tm.channel(p / 4); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 64); + __builtin_prefetch(k0 + 128); + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + __m128i _w2 = __lsx_vld(k0 + 16, 0); + __m128i _w3 = __lsx_vld(k0 + 24, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + __m128i _extw2 = __lsx_vslti_h(_w2, 0); + __m128i _extw3 = __lsx_vslti_h(_w3, 0); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + __m128i _w2l = __lsx_vilvl_h(_extw2, _w2); + __m128i _w2h = __lsx_vilvh_h(_extw2, _w2); + __m128i _w3l = __lsx_vilvl_h(_extw3, _w3); + __m128i _w3h = __lsx_vilvh_h(_extw3, _w3); + + __m128i _val0_0 = __lsx_vreplgr2vr_w(r0[0]); + __m128i _val0_1 = __lsx_vreplgr2vr_w(r0[1]); + __m128i _val0_2 = __lsx_vreplgr2vr_w(r0[2]); + __m128i _val0_3 = __lsx_vreplgr2vr_w(r0[3]); + __m128i _val0_4 = __lsx_vreplgr2vr_w(r0[4]); + __m128i _val0_5 = __lsx_vreplgr2vr_w(r0[5]); + __m128i _val0_6 = __lsx_vreplgr2vr_w(r0[6]); + __m128i 
_val0_7 = __lsx_vreplgr2vr_w(r0[7]); + __m128i _val1_0 = __lsx_vreplgr2vr_w(r0[8]); + __m128i _val1_1 = __lsx_vreplgr2vr_w(r0[9]); + __m128i _val1_2 = __lsx_vreplgr2vr_w(r0[10]); + __m128i _val1_3 = __lsx_vreplgr2vr_w(r0[11]); + __m128i _val1_4 = __lsx_vreplgr2vr_w(r0[12]); + __m128i _val1_5 = __lsx_vreplgr2vr_w(r0[13]); + __m128i _val1_6 = __lsx_vreplgr2vr_w(r0[14]); + __m128i _val1_7 = __lsx_vreplgr2vr_w(r0[15]); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0_0); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val0_1); + _sum2 = __lsx_vmadd_w(_sum2, _w0l, _val1_0); + _sum3 = __lsx_vmadd_w(_sum3, _w0h, _val1_1); + _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val0_2); + _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val0_3); + _sum2 = __lsx_vmadd_w(_sum2, _w1l, _val1_2); + _sum3 = __lsx_vmadd_w(_sum3, _w1h, _val1_3); + _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val0_4); + _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val0_5); + _sum2 = __lsx_vmadd_w(_sum2, _w2l, _val1_4); + _sum3 = __lsx_vmadd_w(_sum3, _w2h, _val1_5); + _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val0_6); + _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val0_7); + _sum2 = __lsx_vmadd_w(_sum2, _w3l, _val1_6); + _sum3 = __lsx_vmadd_w(_sum3, _w3h, _val1_7); + + r0 += 16; + k0 += 32; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + int sum[8]; + __lsx_vst(_sum0, sum, 0); + __lsx_vst(_sum2, sum + 4, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; + output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm[1] = sum[4]; + output1_tm[1] = sum[5]; + output2_tm[1] = sum[6]; + output3_tm[1] = sum[7]; + output0_tm += 2; + output1_tm += 2; + output2_tm += 2; + output3_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 128); + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + __m128i _w2 = __lsx_vld(k0 + 16, 0); + __m128i _w3 = __lsx_vld(k0 + 24, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + __m128i _extw2 = __lsx_vslti_h(_w2, 0); + __m128i _extw3 = __lsx_vslti_h(_w3, 0); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + __m128i _w2l = __lsx_vilvl_h(_extw2, _w2); + __m128i _w2h = __lsx_vilvh_h(_extw2, _w2); + __m128i _w3l = __lsx_vilvl_h(_extw3, _w3); + __m128i _w3h = __lsx_vilvh_h(_extw3, _w3); + + __m128i _val0 = __lsx_vreplgr2vr_w(r0[0]); + __m128i _val1 = __lsx_vreplgr2vr_w(r0[1]); + __m128i _val2 = __lsx_vreplgr2vr_w(r0[2]); + __m128i _val3 = __lsx_vreplgr2vr_w(r0[3]); + __m128i _val4 = __lsx_vreplgr2vr_w(r0[4]); + __m128i _val5 = __lsx_vreplgr2vr_w(r0[5]); + __m128i _val6 = __lsx_vreplgr2vr_w(r0[6]); + __m128i _val7 = __lsx_vreplgr2vr_w(r0[7]); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val1); + _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val2); + _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val3); + _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val4); + _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val5); + _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val6); + _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val7); + + r0 += 8; + k0 += 32; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + + int sum[4]; + __lsx_vst(_sum0, sum, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; 
+ output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm += 1; + output1_tm += 1; + output2_tm += 1; + output3_tm += 1; + } + } + } + + remain_outch_start += nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* output0_tm = top_blob_tm.channel(p); + + const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + for (int q = 0; q < inch; q++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 64); + __m128i _val0 = __lsx_vld(r0, 0); + __m128i _val1 = __lsx_vld(r0 + 8, 0); + + __m128i _extval0 = __lsx_vslti_h(_val0, 0); + __m128i _extval1 = __lsx_vslti_h(_val1, 0); + __m128i _val0l = __lsx_vilvl_h(_extval0, _val0); + __m128i _val0h = __lsx_vilvh_h(_extval0, _val0); + __m128i _val1l = __lsx_vilvl_h(_extval1, _val1); + __m128i _val1h = __lsx_vilvh_h(_extval1, _val1); + + __m128i _w0 = __lsx_vld(k0, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0l); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val0h); + _sum2 = __lsx_vmadd_w(_sum2, _w0l, _val1l); + _sum3 = __lsx_vmadd_w(_sum3, _w0h, _val1h); + + k0 += 8; + r0 += 16; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + output0_tm[0] = __lsx_reduce_add_w(_sum0); + output0_tm[1] = __lsx_reduce_add_w(_sum2); + output0_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + for (int q = 0; q < inch; q++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 32); + __m128i _val = __lsx_vld(r0, 0); + + __m128i _extval = __lsx_vslti_h(_val, 0); + __m128i _vall = __lsx_vilvl_h(_extval, _val); + __m128i _valh = __lsx_vilvh_h(_extval, _val); + + __m128i _w0 = __lsx_vld(k0, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _vall); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _valh); + + k0 += 8; + r0 += 8; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + + output0_tm[0] = __lsx_reduce_add_w(_sum0); + output0_tm++; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_dot_pack8to4_int8.h b/src/layer/loongarch/convolution_winograd_dot_pack8to4_int8.h new file mode 100644 index 000000000000..c20400cbf8c3 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot_pack8to4_int8.h @@ -0,0 +1,233 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_winograd_dot_pack8to4_int8_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 16u, 8, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; + if (tiles >= 2) + bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, batch, 16u, 8, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(1 * inch, tiles, batch, 16u, 8, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 1 < tiles; i += 2) + { + short* tmpptr = tm2.row(i / 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 8, 0); + __lsx_vst(_r0, tmpptr, 0); + __lsx_vst(_r1, tmpptr + 8, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 16; + } + } + for (; i < tiles; i++) + { + short* tmpptr = tm2.row(i / 2 + i % 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + __m128i _r0 = __lsx_vld(r0, 0); + __lsx_vst(_r0, tmpptr, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 8; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 16u, 4, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* output0_tm = top_blob_tm.channel(p); + + const Mat kernel0_tm = kernel_tm.channel(p); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 64); + __builtin_prefetch(k0 + 128); + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + __m128i _w2 = __lsx_vld(k0 + 16, 0); + __m128i _w3 = __lsx_vld(k0 + 24, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + __m128i _extw2 = __lsx_vslti_h(_w2, 0); + __m128i _extw3 = __lsx_vslti_h(_w3, 0); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + __m128i _w2l = __lsx_vilvl_h(_extw2, _w2); + __m128i _w2h = __lsx_vilvh_h(_extw2, _w2); + __m128i _w3l = __lsx_vilvl_h(_extw3, _w3); + __m128i _w3h = __lsx_vilvh_h(_extw3, _w3); + + __m128i _val0_0 = __lsx_vreplgr2vr_w(r0[0]); + __m128i _val0_1 = __lsx_vreplgr2vr_w(r0[1]); + __m128i _val0_2 = __lsx_vreplgr2vr_w(r0[2]); + __m128i _val0_3 = 
__lsx_vreplgr2vr_w(r0[3]); + __m128i _val0_4 = __lsx_vreplgr2vr_w(r0[4]); + __m128i _val0_5 = __lsx_vreplgr2vr_w(r0[5]); + __m128i _val0_6 = __lsx_vreplgr2vr_w(r0[6]); + __m128i _val0_7 = __lsx_vreplgr2vr_w(r0[7]); + __m128i _val1_0 = __lsx_vreplgr2vr_w(r0[8]); + __m128i _val1_1 = __lsx_vreplgr2vr_w(r0[9]); + __m128i _val1_2 = __lsx_vreplgr2vr_w(r0[10]); + __m128i _val1_3 = __lsx_vreplgr2vr_w(r0[11]); + __m128i _val1_4 = __lsx_vreplgr2vr_w(r0[12]); + __m128i _val1_5 = __lsx_vreplgr2vr_w(r0[13]); + __m128i _val1_6 = __lsx_vreplgr2vr_w(r0[14]); + __m128i _val1_7 = __lsx_vreplgr2vr_w(r0[15]); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0_0); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val0_1); + _sum2 = __lsx_vmadd_w(_sum2, _w0l, _val1_0); + _sum3 = __lsx_vmadd_w(_sum3, _w0h, _val1_1); + _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val0_2); + _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val0_3); + _sum2 = __lsx_vmadd_w(_sum2, _w1l, _val1_2); + _sum3 = __lsx_vmadd_w(_sum3, _w1h, _val1_3); + _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val0_4); + _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val0_5); + _sum2 = __lsx_vmadd_w(_sum2, _w2l, _val1_4); + _sum3 = __lsx_vmadd_w(_sum3, _w2h, _val1_5); + _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val0_6); + _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val0_7); + _sum2 = __lsx_vmadd_w(_sum2, _w3l, _val1_6); + _sum3 = __lsx_vmadd_w(_sum3, _w3h, _val1_7); + + r0 += 16; + k0 += 32; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum2, output0_tm + 4, 0); + + output0_tm += 8; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 128); + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + __m128i _w2 = __lsx_vld(k0 + 16, 0); + __m128i _w3 = __lsx_vld(k0 + 24, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + __m128i _extw2 = __lsx_vslti_h(_w2, 0); + __m128i _extw3 = __lsx_vslti_h(_w3, 0); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + __m128i _w2l = __lsx_vilvl_h(_extw2, _w2); + __m128i _w2h = __lsx_vilvh_h(_extw2, _w2); + __m128i _w3l = __lsx_vilvl_h(_extw3, _w3); + __m128i _w3h = __lsx_vilvh_h(_extw3, _w3); + + __m128i _val0 = __lsx_vreplgr2vr_w(r0[0]); + __m128i _val1 = __lsx_vreplgr2vr_w(r0[1]); + __m128i _val2 = __lsx_vreplgr2vr_w(r0[2]); + __m128i _val3 = __lsx_vreplgr2vr_w(r0[3]); + __m128i _val4 = __lsx_vreplgr2vr_w(r0[4]); + __m128i _val5 = __lsx_vreplgr2vr_w(r0[5]); + __m128i _val6 = __lsx_vreplgr2vr_w(r0[6]); + __m128i _val7 = __lsx_vreplgr2vr_w(r0[7]); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val1); + _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val2); + _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val3); + _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val4); + _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val5); + _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val6); + _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val7); + + r0 += 8; + k0 += 32; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + + __lsx_vst(_sum0, output0_tm, 0); + output0_tm += 4; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform.h 
b/src/layer/loongarch/convolution_winograd_transform.h new file mode 100644 index 000000000000..624600e95a0d --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform.h @@ -0,0 +1,405 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_winograd43_transform_input_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 4; + const int h_tiles = (h - 2) / 4; + const int tiles = w_tiles * h_tiles; + + // const float itm[6][6] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[6][6]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 4) + (j * 4); + + for (int m = 0; m < 6; m++) + { + float r00 = r0[0]; + float r01 = r0[1]; + float r02 = r0[2]; + float r03 = r0[3]; + float r04 = r0[4]; + float r05 = r0[5]; + + float tmp0m = 4 * r00 - 5 * r02 + r04; + float tmp1m = -4 * (r01 + r02) + r04 + r03; + float tmp2m = 4 * (r01 - r02) + r04 - r03; + float tmp3m = -2 * (r01 - r03) + r04 - r02; + float tmp4m = 2 * (r01 - r03) + r04 - r02; + float tmp5m = 4 * r01 - 5 * r03 + r05; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + tmp[2][m] = tmp2m; + tmp[3][m] = tmp3m; + tmp[4][m] = tmp4m; + tmp[5][m] = tmp5m; + + r0 += w; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j); + float* r0_tm_1 = r0_tm_0 + tiles; + float* r0_tm_2 = r0_tm_0 + tiles * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 3; + float* r0_tm_4 = r0_tm_0 + tiles * 4; + float* r0_tm_5 = r0_tm_0 + tiles * 5; + + for (int m = 0; m < 6; m++) + { + float tmp00 = tmp[m][0]; + float tmp01 = tmp[m][1]; + float tmp02 = tmp[m][2]; + float tmp03 = tmp[m][3]; + float tmp04 = tmp[m][4]; + float tmp05 = tmp[m][5]; + + float r0tm0 = 4 * tmp00 - 5 * tmp02 + tmp04; + float r0tm1 = -4 * (tmp01 + tmp02) + tmp04 + tmp03; + float r0tm2 = 4 * (tmp01 - tmp02) + tmp04 - tmp03; + float r0tm3 = -2 * (tmp01 - tmp03) + tmp04 - tmp02; + float r0tm4 = 2 * (tmp01 - tmp03) + tmp04 - tmp02; + float r0tm5 = 4 * tmp01 - 5 * tmp03 + tmp05; + + r0_tm_0[0] = r0tm0; + r0_tm_1[0] = r0tm1; + r0_tm_2[0] = r0tm2; + r0_tm_3[0] = r0tm3; + r0_tm_4[0] = r0tm4; + r0_tm_5[0] = r0tm5; + 
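+                    // the six pointers cover six consecutive transform positions of this
+                    // tile (spaced 'tiles' floats apart), so advancing each of them by
+                    // tiles * 6 moves on to the next group of six in the 6x6 block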
+ r0_tm_0 += tiles * 6; + r0_tm_1 += tiles * 6; + r0_tm_2 += tiles * 6; + r0_tm_3 += tiles * 6; + r0_tm_4 += tiles * 6; + r0_tm_5 += tiles * 6; + } + } + } + } +} + +static void conv3x3s1_winograd43_transform_output_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 4; + const int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + float bias0 = biasptr ? biasptr[p] : 0.f; + + float tmp[4][6]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j); + const float* output0_tm_1 = output0_tm_0 + tiles; + const float* output0_tm_2 = output0_tm_0 + tiles * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 3; + const float* output0_tm_4 = output0_tm_0 + tiles * 4; + const float* output0_tm_5 = output0_tm_0 + tiles * 5; + + float* output0 = out0.row(i * 4) + (j * 4); + + for (int m = 0; m < 6; m++) + { + float out0tm0 = output0_tm_0[0]; + float out0tm1 = output0_tm_1[0]; + float out0tm2 = output0_tm_2[0]; + float out0tm3 = output0_tm_3[0]; + float out0tm4 = output0_tm_4[0]; + float out0tm5 = output0_tm_5[0]; + + float tmp02a = out0tm1 + out0tm2; + float tmp13a = out0tm1 - out0tm2; + + float tmp02b = out0tm3 + out0tm4; + float tmp13b = out0tm3 - out0tm4; + + float tmp0m = out0tm0 + tmp02a + tmp02b; + float tmp1m = tmp13a + tmp13b * 2; + float tmp2m = tmp02a + tmp02b * 4; + float tmp3m = out0tm5 + tmp13a + tmp13b * 8; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + tmp[2][m] = tmp2m; + tmp[3][m] = tmp3m; + + output0_tm_0 += tiles * 6; + output0_tm_1 += tiles * 6; + output0_tm_2 += tiles * 6; + output0_tm_3 += tiles * 6; + output0_tm_4 += tiles * 6; + output0_tm_5 += tiles * 6; + } + + for (int m = 0; m < 4; m++) + { + float tmp00 = tmp[m][0]; + float tmp01 = tmp[m][1]; + float tmp02 = tmp[m][2]; + float tmp03 = tmp[m][3]; + float tmp04 = tmp[m][4]; + float tmp05 = tmp[m][5]; + + float tmp02a = tmp01 + tmp02; + float tmp13a = tmp01 - tmp02; + + float tmp02b = tmp03 + tmp04; + float tmp13b = tmp03 - tmp04; + + float out00 = bias0 + tmp00 + tmp02a + tmp02b; + float out01 = bias0 + tmp13a + tmp13b * 2; + float out02 = bias0 + tmp02a + tmp02b * 4; + float out03 = bias0 + tmp05 + tmp13a + tmp13b * 8; + + output0[0] = out00; + output0[1] = out01; + output0[2] = out02; + output0[3] = out03; + + output0 += outw; + } + } + } + } +} + +static void conv3x3s1_winograd23_transform_input_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 2; + const int h_tiles = (h - 2) / 2; + const int tiles = w_tiles * h_tiles; + + // const float itm[4][4] = { + // {1.0f, 0.0f, -1.0f, 0.0f}, + // {0.0f, 1.0f, 1.00f, 0.0f}, + 
// {0.0f, -1.0f, 1.00f, 0.0f}, + // {0.0f, -1.0f, 0.00f, 1.0f} + // }; + + // 0 = r00 - r02 + // 1 = r01 + r02 + // 2 = r02 - r01 + // 3 = r03 - r01 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[4][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 2) + (j * 2); + + for (int m = 0; m < 4; m++) + { + float r00 = r0[0]; + float r01 = r0[1]; + float r02 = r0[2]; + float r03 = r0[3]; + + float tmp0m = r00 - r02; + float tmp1m = r01 + r02; + float tmp2m = r02 - r01; + float tmp3m = r03 - r01; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + tmp[2][m] = tmp2m; + tmp[3][m] = tmp3m; + + r0 += w; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j); + float* r0_tm_1 = r0_tm_0 + tiles; + float* r0_tm_2 = r0_tm_0 + tiles * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 3; + + for (int m = 0; m < 4; m++) + { + float tmp00 = tmp[m][0]; + float tmp01 = tmp[m][1]; + float tmp02 = tmp[m][2]; + float tmp03 = tmp[m][3]; + + float r0tm0 = tmp00 - tmp02; + float r0tm1 = tmp01 + tmp02; + float r0tm2 = tmp02 - tmp01; + float r0tm3 = tmp03 - tmp01; + + r0_tm_0[0] = r0tm0; + r0_tm_1[0] = r0tm1; + r0_tm_2[0] = r0tm2; + r0_tm_3[0] = r0tm3; + + r0_tm_0 += tiles * 4; + r0_tm_1 += tiles * 4; + r0_tm_2 += tiles * 4; + r0_tm_3 += tiles * 4; + } + } + } + } +} + +static void conv3x3s1_winograd23_transform_output_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 2; + const int h_tiles = outh / 2; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[2][4] = { + // {1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 1.0f} + // }; + + // 0 = r00 + r01 + r02 + // 1 = r01 - r02 + r03 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + float bias0 = biasptr ? 
biasptr[p] : 0.f; + + float tmp[2][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j); + const float* output0_tm_1 = output0_tm_0 + tiles; + const float* output0_tm_2 = output0_tm_0 + tiles * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 3; + + float* output0 = out0.row(i * 2) + (j * 2); + + for (int m = 0; m < 4; m++) + { + float out0tm0 = output0_tm_0[0]; + float out0tm1 = output0_tm_1[0]; + float out0tm2 = output0_tm_2[0]; + float out0tm3 = output0_tm_3[0]; + + float tmp0m = out0tm0 + out0tm1 + out0tm2; + float tmp1m = out0tm1 - out0tm2 + out0tm3; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + + output0_tm_0 += tiles * 4; + output0_tm_1 += tiles * 4; + output0_tm_2 += tiles * 4; + output0_tm_3 += tiles * 4; + } + + for (int m = 0; m < 2; m++) + { + float tmp00 = tmp[m][0]; + float tmp01 = tmp[m][1]; + float tmp02 = tmp[m][2]; + float tmp03 = tmp[m][3]; + + float out00 = bias0 + tmp00 + tmp01 + tmp02; + float out01 = bias0 + tmp01 - tmp02 + tmp03; + + output0[0] = out00; + output0[1] = out01; + + output0 += outw; + } + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform_int8.h b/src/layer/loongarch/convolution_winograd_transform_int8.h new file mode 100644 index 000000000000..09ef669e4733 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform_int8.h @@ -0,0 +1,229 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
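
The winograd23 pair that closes the float transform header above implements F(2,3): B^T folds each 4x4 input tile into the transform domain, the per-tile multiply-accumulate over input channels happens elsewhere in this patch, and A^T brings the 4x4 result back to a 2x2 output tile. Below is a minimal scalar 1-D sketch of that identity, assuming the textbook F(2,3) kernel transform G (rows 1, 1/2*(1,1,1), 1/2*(1,-1,1), 1); that G is an illustration-only assumption, since the patch's own kernel transform lives in the convolution source, not in this header.

#include <cassert>
#include <cmath>

int main()
{
    const float d[4] = {1.f, 2.f, 3.f, 4.f}; // one input row of a tile
    const float g[3] = {0.5f, -1.f, 2.f};    // 3-tap kernel

    // reference: direct 3-tap correlation, two outputs
    const float y_ref[2] = {
        d[0] * g[0] + d[1] * g[1] + d[2] * g[2],
        d[1] * g[0] + d[2] * g[1] + d[3] * g[2],
    };

    // input transform, same formulas as the "0 = r00 - r02 ..." comments
    const float u[4] = {d[0] - d[2], d[1] + d[2], d[2] - d[1], d[3] - d[1]};

    // kernel transform G * g (assumed textbook matrix, see note above)
    const float w[4] = {g[0], (g[0] + g[1] + g[2]) * 0.5f, (g[0] - g[1] + g[2]) * 0.5f, g[2]};

    // element-wise product, then output transform
    // ("0 = r00 + r01 + r02", "1 = r01 - r02 + r03")
    float m[4];
    for (int i = 0; i < 4; i++)
        m[i] = u[i] * w[i];
    const float y[2] = {m[0] + m[1] + m[2], m[1] - m[2] + m[3]};

    assert(std::fabs(y[0] - y_ref[0]) < 1e-5f);
    assert(std::fabs(y[1] - y_ref[1]) < 1e-5f);
    return 0;
}

Applying the same identity once along rows and once along columns is exactly what the 4x4 tmp staging buffer in the two functions above does.
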
+ +static void conv3x3s1_winograd43_transform_input_int8_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 4; + const int h_tiles = (h - 2) / 4; + const int tiles = w_tiles * h_tiles; + + // const float itm[6][6] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + short tmp[6][6]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const signed char* r0 = img0.row(i * 4) + (j * 4); + + for (int m = 0; m < 6; m++) + { + signed char r00 = r0[0]; + signed char r01 = r0[1]; + signed char r02 = r0[2]; + signed char r03 = r0[3]; + signed char r04 = r0[4]; + signed char r05 = r0[5]; + + short tmp0m = 4 * r00 - 5 * r02 + r04; + short tmp1m = -4 * (r01 + r02) + r04 + r03; + short tmp2m = 4 * (r01 - r02) + r04 - r03; + short tmp3m = -2 * (r01 - r03) + r04 - r02; + short tmp4m = 2 * (r01 - r03) + r04 - r02; + short tmp5m = 4 * r01 - 5 * r03 + r05; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + tmp[2][m] = tmp2m; + tmp[3][m] = tmp3m; + tmp[4][m] = tmp4m; + tmp[5][m] = tmp5m; + + r0 += w; + } + + short* r0_tm_0 = (short*)img0_tm + (i * w_tiles + j); + short* r0_tm_1 = r0_tm_0 + tiles; + short* r0_tm_2 = r0_tm_0 + tiles * 2; + short* r0_tm_3 = r0_tm_0 + tiles * 3; + short* r0_tm_4 = r0_tm_0 + tiles * 4; + short* r0_tm_5 = r0_tm_0 + tiles * 5; + + for (int m = 0; m < 6; m++) + { + short tmp00 = tmp[m][0]; + short tmp01 = tmp[m][1]; + short tmp02 = tmp[m][2]; + short tmp03 = tmp[m][3]; + short tmp04 = tmp[m][4]; + short tmp05 = tmp[m][5]; + + short r0tm0 = 4 * tmp00 - 5 * tmp02 + tmp04; + short r0tm1 = -4 * (tmp01 + tmp02) + tmp04 + tmp03; + short r0tm2 = 4 * (tmp01 - tmp02) + tmp04 - tmp03; + short r0tm3 = -2 * (tmp01 - tmp03) + tmp04 - tmp02; + short r0tm4 = 2 * (tmp01 - tmp03) + tmp04 - tmp02; + short r0tm5 = 4 * tmp01 - 5 * tmp03 + tmp05; + + r0_tm_0[0] = r0tm0; + r0_tm_1[0] = r0tm1; + r0_tm_2[0] = r0tm2; + r0_tm_3[0] = r0tm3; + r0_tm_4[0] = r0tm4; + r0_tm_5[0] = r0tm5; + + r0_tm_0 += tiles * 6; + r0_tm_1 += tiles * 6; + r0_tm_2 += tiles * 6; + r0_tm_3 += tiles * 6; + r0_tm_4 += tiles * 6; + r0_tm_5 += tiles * 6; + } + } + } + } +} + +static void conv3x3s1_winograd43_transform_output_int8_lsx(const Mat& top_blob_tm, Mat& top_blob, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 4; + const int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - 
r02) + (r03 - r04) * 8 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + int tmp[4][6]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const int* output0_tm_0 = (const int*)out0_tm + (i * w_tiles + j) * 1; + const int* output0_tm_1 = output0_tm_0 + tiles * 1; + const int* output0_tm_2 = output0_tm_0 + tiles * 2; + const int* output0_tm_3 = output0_tm_0 + tiles * 3; + const int* output0_tm_4 = output0_tm_0 + tiles * 4; + const int* output0_tm_5 = output0_tm_0 + tiles * 5; + + int* output0 = out0.row(i * 4) + j * 4; + + for (int m = 0; m < 5; m++) + { + int tmp02a = output0_tm_1[0] + output0_tm_2[0]; + int tmp13a = output0_tm_1[0] - output0_tm_2[0]; + + int tmp02b = output0_tm_3[0] + output0_tm_4[0]; + int tmp13b = output0_tm_3[0] - output0_tm_4[0]; + + tmp[0][m] = output0_tm_0[0] + tmp02a + tmp02b; + tmp[1][m] = tmp13a + tmp13b * 2; + tmp[2][m] = tmp02a + tmp02b * 4; + tmp[3][m] = output0_tm_5[0] * 4 + tmp13a + tmp13b * 8; + + output0_tm_0 += tiles * 6; + output0_tm_1 += tiles * 6; + output0_tm_2 += tiles * 6; + output0_tm_3 += tiles * 6; + output0_tm_4 += tiles * 6; + output0_tm_5 += tiles * 6; + } + for (int m = 5; m < 6; m++) + { + int tmp02a = output0_tm_1[0] + output0_tm_2[0]; + int tmp13a = output0_tm_1[0] - output0_tm_2[0]; + + int tmp02b = output0_tm_3[0] + output0_tm_4[0]; + int tmp13b = output0_tm_3[0] - output0_tm_4[0]; + + tmp[0][m] = (output0_tm_0[0] + tmp02a + tmp02b) * 4; + tmp[1][m] = (tmp13a + tmp13b * 2) * 4; + tmp[2][m] = (tmp02a + tmp02b * 4) * 4; + tmp[3][m] = (output0_tm_5[0] * 4 + tmp13a + tmp13b * 8) * 4; + + output0_tm_0 += tiles * 6; + output0_tm_1 += tiles * 6; + output0_tm_2 += tiles * 6; + output0_tm_3 += tiles * 6; + output0_tm_4 += tiles * 6; + output0_tm_5 += tiles * 6; + } + + for (int m = 0; m < 4; m++) + { + const int* tmp0 = tmp[m]; + + int tmp02a = tmp0[1] + tmp0[2]; + int tmp13a = tmp0[1] - tmp0[2]; + + int tmp02b = tmp0[3] + tmp0[4]; + int tmp13b = tmp0[3] - tmp0[4]; + + output0[0] = (tmp0[0] + tmp02a + tmp02b) / 576; + output0[1] = (tmp13a + tmp13b * 2) / 576; + output0[2] = (tmp02a + tmp02b * 4) / 576; + output0[3] = (tmp0[5] + tmp13a + tmp13b * 8) / 576; + + output0 += outw; + } + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform_pack4.h b/src/layer/loongarch/convolution_winograd_transform_pack4.h new file mode 100644 index 000000000000..3969e59cf09c --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform_pack4.h @@ -0,0 +1,730 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
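
conv3x3s1_winograd43_transform_input_int8_lsx above can stay in 16-bit integer arithmetic because every coefficient of the commented itm[6][6] matrix is a small integer. The sketch below, with arbitrary values, only checks that the fused expressions used in that function ("0 = 4 * r00 - 5 * r02 + r04" and friends) are exactly the rows of the commented matrix applied to a 6-element column.

#include <cassert>

int main()
{
    const signed char r[6] = {3, -7, 12, 0, -5, 9};

    // B^T as written in the itm[6][6] comment above
    const short itm[6][6] = {
        {4, 0, -5, 0, 1, 0},
        {0, -4, -4, 1, 1, 0},
        {0, 4, -4, -1, 1, 0},
        {0, -2, -1, 2, 1, 0},
        {0, 2, -1, -2, 1, 0},
        {0, 4, 0, -5, 0, 1},
    };

    short ref[6];
    for (int i = 0; i < 6; i++)
    {
        int acc = 0;
        for (int k = 0; k < 6; k++)
            acc += itm[i][k] * r[k];
        ref[i] = (short)acc;
    }

    // fused form, as in the input transform above
    short fused[6];
    fused[0] = 4 * r[0] - 5 * r[2] + r[4];
    fused[1] = -4 * (r[1] + r[2]) + r[4] + r[3];
    fused[2] = 4 * (r[1] - r[2]) + r[4] - r[3];
    fused[3] = -2 * (r[1] - r[3]) + r[4] - r[2];
    fused[4] = 2 * (r[1] - r[3]) + r[4] - r[2];
    fused[5] = 4 * r[1] - 5 * r[3] + r[5];

    for (int i = 0; i < 6; i++)
        assert(ref[i] == fused[i]);
    return 0;
}
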
+ +static void conv3x3s1_winograd63_transform_input_pack4_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 6; + const int h_tiles = (h - 2) / 6; + const int tiles = w_tiles * h_tiles; + + // const float itm[8][8] = { + // {1.0f, 0.0f, -5.25f, 0.00f, 5.25f, 0.00f, -1.0f, 0.0f}, + // + // {0.0f, 1.0f, 1.00f, -4.25f, -4.25f, 1.00f, 1.0f, 0.0f}, + // {0.0f, -1.0f, 1.00f, 4.25f, -4.25f, -1.00f, 1.0f, 0.0f}, + // + // {0.0f, 0.5f, 0.25f, -2.50f, -1.25f, 2.00f, 1.0f, 0.0f}, + // {0.0f, -0.5f, 0.25f, 2.50f, -1.25f, -2.00f, 1.0f, 0.0f}, + // + // {0.0f, 2.0f, 4.00f, -2.50f, -5.00f, 0.50f, 1.0f, 0.0f}, + // {0.0f, -2.0f, 4.00f, 2.50f, -5.00f, -0.50f, 1.0f, 0.0f}, + // + // {0.0f, -1.0f, 0.00f, 5.25f, 0.00f, -5.25f, 0.0f, 1.0f} + // }; + + // 0 = r00 - r06 + (r04 - r02) * 5.25 + // 7 = r07 - r01 + (r03 - r05) * 5.25 + + // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05) + // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05) + + // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2) + // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2) + + // reuse r04 * 1.25 + // reuse r03 * 2.5 + // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5) + // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5) + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[8][8][4]; + + __m128 _v5_25 = __lsx_vreplfr2vr_s(5.25f); + __m128 _vm4_25 = __lsx_vreplfr2vr_s(-4.25f); + __m128 _vm1_25 = __lsx_vreplfr2vr_s(-1.25f); + __m128 _v0_25 = __lsx_vreplfr2vr_s(0.25f); + __m128 _vm2_5 = __lsx_vreplfr2vr_s(-2.5f); + __m128 _v0_5 = __lsx_vreplfr2vr_s(0.5f); + __m128 _v2 = __lsx_vreplfr2vr_s(2.f); + __m128 _v4 = __lsx_vreplfr2vr_s(4.f); + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 6) + (j * 6) * 4; + + for (int m = 0; m < 8; m++) + { + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + __m128 _r05 = (__m128)__lsx_vld(r0 + 4 * 5, 0); + __m128 _r06 = (__m128)__lsx_vld(r0 + 4 * 6, 0); + __m128 _r07 = (__m128)__lsx_vld(r0 + 4 * 7, 0); + + __m128 _tmp0m = __lsx_vfmadd_s(__lsx_vfsub_s(_r04, _r02), _v5_25, __lsx_vfsub_s(_r00, _r06)); + __m128 _tmp7m = __lsx_vfmadd_s(__lsx_vfsub_s(_r03, _r05), _v5_25, __lsx_vfsub_s(_r07, _r01)); + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp7m, tmp[7][m], 0); + + __m128 _tmp12a = __lsx_vfmadd_s(_r04, _vm4_25, __lsx_vfadd_s(_r02, _r06)); + __m128 _tmp12b = __lsx_vfmadd_s(_r03, _vm4_25, __lsx_vfadd_s(_r01, _r05)); + + __m128 _tmp1m = __lsx_vfadd_s(_tmp12a, _tmp12b); + __m128 _tmp2m = __lsx_vfsub_s(_tmp12a, _tmp12b); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + + __m128 _tmp34a = __lsx_vfmadd_s(_r04, _vm1_25, __lsx_vfmadd_s(_r02, _v0_25, _r06)); + __m128 _tmp34b = __lsx_vfmadd_s(_r05, _v2, __lsx_vfmadd_s(_r03, _vm2_5, __lsx_vfmul_s(_r01, _v0_5))); + + __m128 _tmp3m = __lsx_vfadd_s(_tmp34a, _tmp34b); + __m128 _tmp4m = __lsx_vfsub_s(_tmp34a, _tmp34b); + __lsx_vst(_tmp3m, tmp[3][m], 0); + __lsx_vst(_tmp4m, tmp[4][m], 0); + + __m128 _tmp56a = 
__lsx_vfmadd_s(__lsx_vfmadd_s(_r04, _vm1_25, _r02), _v4, _r06); + __m128 _tmp56b = __lsx_vfmadd_s(_r05, _v0_5, __lsx_vfmadd_s(_r03, _vm2_5, __lsx_vfmul_s(_r01, _v2))); + + __m128 _tmp5m = __lsx_vfadd_s(_tmp56a, _tmp56b); + __m128 _tmp6m = __lsx_vfsub_s(_tmp56a, _tmp56b); + __lsx_vst(_tmp5m, tmp[5][m], 0); + __lsx_vst(_tmp6m, tmp[6][m], 0); + + r0 += w * 4; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j) * 4; + float* r0_tm_1 = r0_tm_0 + tiles * 4; + float* r0_tm_2 = r0_tm_0 + tiles * 4 * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 4 * 3; + float* r0_tm_4 = r0_tm_0 + tiles * 4 * 4; + float* r0_tm_5 = r0_tm_0 + tiles * 4 * 5; + float* r0_tm_6 = r0_tm_0 + tiles * 4 * 6; + float* r0_tm_7 = r0_tm_0 + tiles * 4 * 7; + + for (int m = 0; m < 8; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0); + __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0); + __m128 _tmp06 = (__m128)__lsx_vld(tmp[m][6], 0); + __m128 _tmp07 = (__m128)__lsx_vld(tmp[m][7], 0); + + __m128 _r0tm0 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp04, _tmp02), _v5_25, __lsx_vfsub_s(_tmp00, _tmp06)); + __m128 _r0tm7 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp03, _tmp05), _v5_25, __lsx_vfsub_s(_tmp07, _tmp01)); + + __m128 _tmp12a = __lsx_vfmadd_s(_tmp04, _vm4_25, __lsx_vfadd_s(_tmp02, _tmp06)); + __m128 _tmp12b = __lsx_vfmadd_s(_tmp03, _vm4_25, __lsx_vfadd_s(_tmp01, _tmp05)); + + __m128 _r0tm1 = __lsx_vfadd_s(_tmp12a, _tmp12b); + __m128 _r0tm2 = __lsx_vfsub_s(_tmp12a, _tmp12b); + + __m128 _tmp34a = __lsx_vfmadd_s(_tmp04, _vm1_25, __lsx_vfmadd_s(_tmp02, _v0_25, _tmp06)); + __m128 _tmp34b = __lsx_vfmadd_s(_tmp05, _v2, __lsx_vfmadd_s(_tmp03, _vm2_5, __lsx_vfmul_s(_tmp01, _v0_5))); + + __m128 _r0tm3 = __lsx_vfadd_s(_tmp34a, _tmp34b); + __m128 _r0tm4 = __lsx_vfsub_s(_tmp34a, _tmp34b); + + __m128 _tmp56a = __lsx_vfmadd_s(__lsx_vfmadd_s(_tmp04, _vm1_25, _tmp02), _v4, _tmp06); + __m128 _tmp56b = __lsx_vfmadd_s(_tmp05, _v0_5, __lsx_vfmadd_s(_tmp03, _vm2_5, __lsx_vfmul_s(_tmp01, _v2))); + + __m128 _r0tm5 = __lsx_vfadd_s(_tmp56a, _tmp56b); + __m128 _r0tm6 = __lsx_vfsub_s(_tmp56a, _tmp56b); + + __lsx_vst(_r0tm0, r0_tm_0, 0); + __lsx_vst(_r0tm1, r0_tm_1, 0); + __lsx_vst(_r0tm2, r0_tm_2, 0); + __lsx_vst(_r0tm3, r0_tm_3, 0); + __lsx_vst(_r0tm4, r0_tm_4, 0); + __lsx_vst(_r0tm5, r0_tm_5, 0); + __lsx_vst(_r0tm6, r0_tm_6, 0); + __lsx_vst(_r0tm7, r0_tm_7, 0); + + r0_tm_0 += tiles * 4 * 8; + r0_tm_1 += tiles * 4 * 8; + r0_tm_2 += tiles * 4 * 8; + r0_tm_3 += tiles * 4 * 8; + r0_tm_4 += tiles * 4 * 8; + r0_tm_5 += tiles * 4 * 8; + r0_tm_6 += tiles * 4 * 8; + r0_tm_7 += tiles * 4 * 8; + } + } + } + } +} + +static void conv3x3s1_winograd63_transform_output_pack4_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 6; + const int h_tiles = outh / 6; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[6][8] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 16.0f,-16.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 8.0f, 8.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 4.0f, -4.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 16.0f, 16.0f, 2.0f, 2.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 32.0f, -32.0f, 1.0f, -1.0f, 1.0f} + // }; + + // 0 = r0 + 
(r1 + r2) + (r3 + r4) + (r5 + r6) * 32 + // 1 = (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16 + // 2 = (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8 + // 3 = (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4 + // 4 = (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2 + // 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6) + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = biasptr ? (__m128)__lsx_vld(biasptr + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + float tmp[6][8][4]; + + __m128 _v32 = __lsx_vreplfr2vr_s(32.f); + __m128 _v16 = __lsx_vreplfr2vr_s(16.f); + __m128 _v8 = __lsx_vreplfr2vr_s(8.f); + __m128 _v4 = __lsx_vreplfr2vr_s(4.f); + __m128 _v2 = __lsx_vreplfr2vr_s(2.f); + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j) * 4; + const float* output0_tm_1 = output0_tm_0 + tiles * 4; + const float* output0_tm_2 = output0_tm_0 + tiles * 4 * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 4 * 3; + const float* output0_tm_4 = output0_tm_0 + tiles * 4 * 4; + const float* output0_tm_5 = output0_tm_0 + tiles * 4 * 5; + const float* output0_tm_6 = output0_tm_0 + tiles * 4 * 6; + const float* output0_tm_7 = output0_tm_0 + tiles * 4 * 7; + + float* output0 = out0.row(i * 6) + (j * 6) * 4; + + for (int m = 0; m < 8; m++) + { + __m128 _out0tm0 = (__m128)__lsx_vld(output0_tm_0, 0); + __m128 _out0tm1 = (__m128)__lsx_vld(output0_tm_1, 0); + __m128 _out0tm2 = (__m128)__lsx_vld(output0_tm_2, 0); + __m128 _out0tm3 = (__m128)__lsx_vld(output0_tm_3, 0); + __m128 _out0tm4 = (__m128)__lsx_vld(output0_tm_4, 0); + __m128 _out0tm5 = (__m128)__lsx_vld(output0_tm_5, 0); + __m128 _out0tm6 = (__m128)__lsx_vld(output0_tm_6, 0); + __m128 _out0tm7 = (__m128)__lsx_vld(output0_tm_7, 0); + + __m128 _tmp024a = __lsx_vfadd_s(_out0tm1, _out0tm2); + __m128 _tmp135a = __lsx_vfsub_s(_out0tm1, _out0tm2); + + __m128 _tmp024b = __lsx_vfadd_s(_out0tm3, _out0tm4); + __m128 _tmp135b = __lsx_vfsub_s(_out0tm3, _out0tm4); + + __m128 _tmp024c = __lsx_vfadd_s(_out0tm5, _out0tm6); + __m128 _tmp135c = __lsx_vfsub_s(_out0tm5, _out0tm6); + + __m128 _tmp0m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm0, _tmp024a), __lsx_vfmadd_s(_tmp024c, _v32, _tmp024b)); + __m128 _tmp2m = __lsx_vfmadd_s(_tmp024c, _v8, __lsx_vfmadd_s(_tmp024b, _v4, _tmp024a)); + __m128 _tmp4m = __lsx_vfmadd_s(_tmp024c, _v2, __lsx_vfmadd_s(_tmp024b, _v16, _tmp024a)); + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp4m, tmp[4][m], 0); + + __m128 _tmp1m = __lsx_vfmadd_s(_tmp135c, _v16, __lsx_vfmadd_s(_tmp135b, _v2, _tmp135a)); + __m128 _tmp3m = __lsx_vfmadd_s(_tmp135c, _v4, __lsx_vfmadd_s(_tmp135b, _v8, _tmp135a)); + __m128 _tmp5m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm7, _tmp135a), __lsx_vfmadd_s(_tmp135b, _v32, _tmp135c)); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + __lsx_vst(_tmp5m, tmp[5][m], 0); + + output0_tm_0 += tiles * 4 * 8; + output0_tm_1 += tiles * 4 * 8; + output0_tm_2 += tiles * 4 * 8; + output0_tm_3 += tiles * 4 * 8; + output0_tm_4 += tiles * 4 * 8; + output0_tm_5 += tiles * 4 * 8; + output0_tm_6 += tiles * 4 * 8; + output0_tm_7 += tiles * 4 * 8; + } + + for (int m = 0; m < 6; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = 
(__m128)__lsx_vld(tmp[m][3], 0); + __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0); + __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0); + __m128 _tmp06 = (__m128)__lsx_vld(tmp[m][6], 0); + __m128 _tmp07 = (__m128)__lsx_vld(tmp[m][7], 0); + + __m128 _tmp024a = __lsx_vfadd_s(_tmp01, _tmp02); + __m128 _tmp135a = __lsx_vfsub_s(_tmp01, _tmp02); + + __m128 _tmp024b = __lsx_vfadd_s(_tmp03, _tmp04); + __m128 _tmp135b = __lsx_vfsub_s(_tmp03, _tmp04); + + __m128 _tmp024c = __lsx_vfadd_s(_tmp05, _tmp06); + __m128 _tmp135c = __lsx_vfsub_s(_tmp05, _tmp06); + + __m128 _out00 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp00, _tmp024a), __lsx_vfmadd_s(_tmp024c, _v32, _tmp024b))); + __m128 _out02 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp024c, _v8, __lsx_vfmadd_s(_tmp024b, _v4, _tmp024a))); + __m128 _out04 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp024c, _v2, __lsx_vfmadd_s(_tmp024b, _v16, _tmp024a))); + __lsx_vst(_out00, output0, 0); + __lsx_vst(_out02, output0 + 4 * 2, 0); + __lsx_vst(_out04, output0 + 4 * 4, 0); + + __m128 _out01 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp135c, _v16, __lsx_vfmadd_s(_tmp135b, _v2, _tmp135a))); + __m128 _out03 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp135c, _v4, __lsx_vfmadd_s(_tmp135b, _v8, _tmp135a))); + __m128 _out05 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp07, _tmp135a), __lsx_vfmadd_s(_tmp135b, _v32, _tmp135c))); + __lsx_vst(_out01, output0 + 4, 0); + __lsx_vst(_out03, output0 + 4 * 3, 0); + __lsx_vst(_out05, output0 + 4 * 5, 0); + + output0 += outw * 4; + } + } + } + } +} + +static void conv3x3s1_winograd43_transform_input_pack4_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 4; + const int h_tiles = (h - 2) / 4; + const int tiles = w_tiles * h_tiles; + + // const float itm[6][6] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[6][6][4]; + + __m128 _vm5 = __lsx_vreplfr2vr_s(-5.f); + __m128 _vm4 = __lsx_vreplfr2vr_s(-4.f); + __m128 _v4 = __lsx_vreplfr2vr_s(4.f); + __m128 _vm2 = __lsx_vreplfr2vr_s(-2.f); + __m128 _v2 = __lsx_vreplfr2vr_s(2.f); + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 4) + (j * 4) * 4; + + for (int m = 0; m < 6; m++) + { + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + __m128 _r05 = (__m128)__lsx_vld(r0 + 4 * 5, 0); + + __m128 _tmp0m = __lsx_vfmadd_s(_r02, _vm5, __lsx_vfmadd_s(_r00, _v4, _r04)); + __m128 _tmp1m = __lsx_vfmadd_s(__lsx_vfadd_s(_r01, _r02), _vm4, __lsx_vfadd_s(_r04, _r03)); + __m128 _tmp2m = __lsx_vfmadd_s(__lsx_vfsub_s(_r01, _r02), _v4, __lsx_vfsub_s(_r04, _r03)); + 
__m128 _tmp3m = __lsx_vfmadd_s(__lsx_vfsub_s(_r01, _r03), _vm2, __lsx_vfsub_s(_r04, _r02)); + __m128 _tmp4m = __lsx_vfmadd_s(__lsx_vfsub_s(_r01, _r03), _v2, __lsx_vfsub_s(_r04, _r02)); + __m128 _tmp5m = __lsx_vfmadd_s(_r03, _vm5, __lsx_vfmadd_s(_r01, _v4, _r05)); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + __lsx_vst(_tmp4m, tmp[4][m], 0); + __lsx_vst(_tmp5m, tmp[5][m], 0); + + r0 += w * 4; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j) * 4; + float* r0_tm_1 = r0_tm_0 + tiles * 4; + float* r0_tm_2 = r0_tm_0 + tiles * 4 * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 4 * 3; + float* r0_tm_4 = r0_tm_0 + tiles * 4 * 4; + float* r0_tm_5 = r0_tm_0 + tiles * 4 * 5; + + for (int m = 0; m < 6; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0); + __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0); + + __m128 _r0tm0 = __lsx_vfmadd_s(_tmp02, _vm5, __lsx_vfmadd_s(_tmp00, _v4, _tmp04)); + __m128 _r0tm1 = __lsx_vfmadd_s(__lsx_vfadd_s(_tmp01, _tmp02), _vm4, __lsx_vfadd_s(_tmp04, _tmp03)); + __m128 _r0tm2 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp01, _tmp02), _v4, __lsx_vfsub_s(_tmp04, _tmp03)); + __m128 _r0tm3 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp01, _tmp03), _vm2, __lsx_vfsub_s(_tmp04, _tmp02)); + __m128 _r0tm4 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp01, _tmp03), _v2, __lsx_vfsub_s(_tmp04, _tmp02)); + __m128 _r0tm5 = __lsx_vfmadd_s(_tmp03, _vm5, __lsx_vfmadd_s(_tmp01, _v4, _tmp05)); + + __lsx_vst(_r0tm0, r0_tm_0, 0); + __lsx_vst(_r0tm1, r0_tm_1, 0); + __lsx_vst(_r0tm2, r0_tm_2, 0); + __lsx_vst(_r0tm3, r0_tm_3, 0); + __lsx_vst(_r0tm4, r0_tm_4, 0); + __lsx_vst(_r0tm5, r0_tm_5, 0); + + r0_tm_0 += tiles * 4 * 6; + r0_tm_1 += tiles * 4 * 6; + r0_tm_2 += tiles * 4 * 6; + r0_tm_3 += tiles * 4 * 6; + r0_tm_4 += tiles * 4 * 6; + r0_tm_5 += tiles * 4 * 6; + } + } + } + } +} + +static void conv3x3s1_winograd43_transform_output_pack4_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 4; + const int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = biasptr ? 
(__m128)__lsx_vld(biasptr + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + float tmp[4][6][4]; + + __m128 _v2 = __lsx_vreplfr2vr_s(2.f); + __m128 _v4 = __lsx_vreplfr2vr_s(4.f); + __m128 _v8 = __lsx_vreplfr2vr_s(8.f); + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j) * 4; + const float* output0_tm_1 = output0_tm_0 + tiles * 4; + const float* output0_tm_2 = output0_tm_0 + tiles * 4 * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 4 * 3; + const float* output0_tm_4 = output0_tm_0 + tiles * 4 * 4; + const float* output0_tm_5 = output0_tm_0 + tiles * 4 * 5; + + float* output0 = out0.row(i * 4) + (j * 4) * 4; + + for (int m = 0; m < 6; m++) + { + __m128 _out0tm0 = (__m128)__lsx_vld(output0_tm_0, 0); + __m128 _out0tm1 = (__m128)__lsx_vld(output0_tm_1, 0); + __m128 _out0tm2 = (__m128)__lsx_vld(output0_tm_2, 0); + __m128 _out0tm3 = (__m128)__lsx_vld(output0_tm_3, 0); + __m128 _out0tm4 = (__m128)__lsx_vld(output0_tm_4, 0); + __m128 _out0tm5 = (__m128)__lsx_vld(output0_tm_5, 0); + + __m128 _tmp02a = __lsx_vfadd_s(_out0tm1, _out0tm2); + __m128 _tmp13a = __lsx_vfsub_s(_out0tm1, _out0tm2); + + __m128 _tmp02b = __lsx_vfadd_s(_out0tm3, _out0tm4); + __m128 _tmp13b = __lsx_vfsub_s(_out0tm3, _out0tm4); + + __m128 _tmp0m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm0, _tmp02a), _tmp02b); + __m128 _tmp1m = __lsx_vfmadd_s(_tmp13b, _v2, _tmp13a); + __m128 _tmp2m = __lsx_vfmadd_s(_tmp02b, _v4, _tmp02a); + __m128 _tmp3m = __lsx_vfmadd_s(_tmp13b, _v8, __lsx_vfadd_s(_out0tm5, _tmp13a)); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + + output0_tm_0 += tiles * 4 * 6; + output0_tm_1 += tiles * 4 * 6; + output0_tm_2 += tiles * 4 * 6; + output0_tm_3 += tiles * 4 * 6; + output0_tm_4 += tiles * 4 * 6; + output0_tm_5 += tiles * 4 * 6; + } + + for (int m = 0; m < 4; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0); + __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0); + + __m128 _tmp02a = __lsx_vfadd_s(_tmp01, _tmp02); + __m128 _tmp13a = __lsx_vfsub_s(_tmp01, _tmp02); + + __m128 _tmp02b = __lsx_vfadd_s(_tmp03, _tmp04); + __m128 _tmp13b = __lsx_vfsub_s(_tmp03, _tmp04); + + __m128 _out00 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp00, _tmp02a), _tmp02b)); + __m128 _out01 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp13b, _v2, _tmp13a)); + __m128 _out02 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp02b, _v4, _tmp02a)); + __m128 _out03 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp13b, _v8, __lsx_vfadd_s(_tmp05, _tmp13a))); + + __lsx_vst(_out00, output0, 0); + __lsx_vst(_out01, output0 + 4, 0); + __lsx_vst(_out02, output0 + 4 * 2, 0); + __lsx_vst(_out03, output0 + 4 * 3, 0); + + output0 += outw * 4; + } + } + } + } +} + +static void conv3x3s1_winograd23_transform_input_pack4_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 2; + const int h_tiles = (h - 2) / 2; + const int tiles = w_tiles * h_tiles; + + // const float itm[4][4] = { + // {1.0f, 0.0f, -1.0f, 0.0f}, + // {0.0f, 1.0f, 1.00f, 0.0f}, + // {0.0f, -1.0f, 1.00f, 0.0f}, + // {0.0f, -1.0f, 0.00f, 1.0f} + // 
}; + + // 0 = r00 - r02 + // 1 = r01 + r02 + // 2 = r02 - r01 + // 3 = r03 - r01 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[4][4][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 2) + (j * 2) * 4; + + for (int m = 0; m < 4; m++) + { + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + + __m128 _tmp0m = __lsx_vfsub_s(_r00, _r02); + __m128 _tmp1m = __lsx_vfadd_s(_r01, _r02); + __m128 _tmp2m = __lsx_vfsub_s(_r02, _r01); + __m128 _tmp3m = __lsx_vfsub_s(_r03, _r01); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + + r0 += w * 4; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j) * 4; + float* r0_tm_1 = r0_tm_0 + tiles * 4; + float* r0_tm_2 = r0_tm_0 + tiles * 4 * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 4 * 3; + + for (int m = 0; m < 4; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + + __m128 _r0tm0 = __lsx_vfsub_s(_tmp00, _tmp02); + __m128 _r0tm1 = __lsx_vfadd_s(_tmp01, _tmp02); + __m128 _r0tm2 = __lsx_vfsub_s(_tmp02, _tmp01); + __m128 _r0tm3 = __lsx_vfsub_s(_tmp03, _tmp01); + + __lsx_vst(_r0tm0, r0_tm_0, 0); + __lsx_vst(_r0tm1, r0_tm_1, 0); + __lsx_vst(_r0tm2, r0_tm_2, 0); + __lsx_vst(_r0tm3, r0_tm_3, 0); + + r0_tm_0 += tiles * 4 * 4; + r0_tm_1 += tiles * 4 * 4; + r0_tm_2 += tiles * 4 * 4; + r0_tm_3 += tiles * 4 * 4; + } + } + } + } +} + +static void conv3x3s1_winograd23_transform_output_pack4_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 2; + const int h_tiles = outh / 2; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[2][4] = { + // {1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 1.0f} + // }; + + // 0 = r00 + r01 + r02 + // 1 = r01 - r02 + r03 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = biasptr ? 
(__m128)__lsx_vld(biasptr + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + float tmp[2][4][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j) * 4; + const float* output0_tm_1 = output0_tm_0 + tiles * 4; + const float* output0_tm_2 = output0_tm_0 + tiles * 4 * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 4 * 3; + + float* output0 = out0.row(i * 2) + (j * 2) * 4; + + for (int m = 0; m < 4; m++) + { + __m128 _out0tm0 = (__m128)__lsx_vld(output0_tm_0, 0); + __m128 _out0tm1 = (__m128)__lsx_vld(output0_tm_1, 0); + __m128 _out0tm2 = (__m128)__lsx_vld(output0_tm_2, 0); + __m128 _out0tm3 = (__m128)__lsx_vld(output0_tm_3, 0); + + __m128 _tmp0m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm0, _out0tm1), _out0tm2); + __m128 _tmp1m = __lsx_vfadd_s(__lsx_vfsub_s(_out0tm1, _out0tm2), _out0tm3); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + + output0_tm_0 += tiles * 4 * 4; + output0_tm_1 += tiles * 4 * 4; + output0_tm_2 += tiles * 4 * 4; + output0_tm_3 += tiles * 4 * 4; + } + + for (int m = 0; m < 2; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + + __m128 _out00 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp00, _tmp01), _tmp02)); + __m128 _out01 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfsub_s(_tmp01, _tmp02), _tmp03)); + + __lsx_vst(_out00, output0, 0); + __lsx_vst(_out01, output0 + 4, 0); + + output0 += outw * 4; + } + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform_pack4_int8.h b/src/layer/loongarch/convolution_winograd_transform_pack4_int8.h new file mode 100644 index 000000000000..8b31ce97a869 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform_pack4_int8.h @@ -0,0 +1,166 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
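
Every row of the pack4 transforms above is evaluated as a chain of fused multiply-adds: as used in this file, __lsx_vfmadd_s(a, b, c) corresponds to a * b + c in each float lane. The scalar sketch below mirrors the _tmp34a / _tmp34b split from the F(6,3) input transform with std::fma and checks the chained form against the plain polynomial of the commented itm row; the data values are arbitrary.

#include <cassert>
#include <cmath>

int main()
{
    const float r[8] = {1.f, -2.f, 3.f, 0.5f, -1.5f, 4.f, 2.f, -3.f};

    // row 3 of the commented itm[8][8]:
    // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2)
    const float direct = r[6] + r[2] * 0.25f - r[4] * 1.25f
                         + r[1] * 0.5f - r[3] * 2.5f + r[5] * 2.f;

    // the same value the way the vector code builds it:
    // two halves, each a chain of fma, then one add
    const float tmp34a = std::fma(r[4], -1.25f, std::fma(r[2], 0.25f, r[6]));
    const float tmp34b = std::fma(r[5], 2.f, std::fma(r[3], -2.5f, r[1] * 0.5f));
    const float fused = tmp34a + tmp34b;

    assert(std::fabs(direct - fused) < 1e-5f);
    return 0;
}

The constants are hoisted into _v0_25, _vm1_25, _vm2_5 and so on before the tile loops for the same reason a scalar compiler would hoist them: they are loop-invariant broadcasts.
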
+ +static void conv3x3s1_winograd43_transform_output_pack4_int8_lsx(const Mat& top_blob_tm, Mat& top_blob, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 4; + const int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + int tmp[4][6][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const int* output0_tm_0 = (const int*)out0_tm + (i * w_tiles + j) * 4; + const int* output0_tm_1 = output0_tm_0 + tiles * 4; + const int* output0_tm_2 = output0_tm_0 + tiles * 8; + const int* output0_tm_3 = output0_tm_0 + tiles * 12; + const int* output0_tm_4 = output0_tm_0 + tiles * 16; + const int* output0_tm_5 = output0_tm_0 + tiles * 20; + + int* output0 = out0.row(i * 4) + (j * 4) * 4; + + for (int m = 0; m < 5; m++) + { + __m128i _out0tm0 = __lsx_vld(output0_tm_0, 0); + __m128i _out0tm1 = __lsx_vld(output0_tm_1, 0); + __m128i _out0tm2 = __lsx_vld(output0_tm_2, 0); + __m128i _out0tm3 = __lsx_vld(output0_tm_3, 0); + __m128i _out0tm4 = __lsx_vld(output0_tm_4, 0); + __m128i _out0tm5 = __lsx_vld(output0_tm_5, 0); + + __m128i _tmp02a = __lsx_vadd_w(_out0tm1, _out0tm2); + __m128i _tmp13a = __lsx_vsub_w(_out0tm1, _out0tm2); + + __m128i _tmp02b = __lsx_vadd_w(_out0tm3, _out0tm4); + __m128i _tmp13b = __lsx_vsub_w(_out0tm3, _out0tm4); + + __m128i _tmp0m = __lsx_vadd_w(__lsx_vadd_w(_out0tm0, _tmp02a), _tmp02b); + __m128i _tmp1m = __lsx_vadd_w(_tmp13a, __lsx_vslli_w(_tmp13b, 1)); + __m128i _tmp2m = __lsx_vadd_w(_tmp02a, __lsx_vslli_w(_tmp02b, 2)); + __m128i _tmp3m = __lsx_vadd_w(__lsx_vadd_w(_tmp13a, __lsx_vslli_w(_out0tm5, 2)), __lsx_vslli_w(_tmp13b, 3)); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + + output0_tm_0 += tiles * 24; + output0_tm_1 += tiles * 24; + output0_tm_2 += tiles * 24; + output0_tm_3 += tiles * 24; + output0_tm_4 += tiles * 24; + output0_tm_5 += tiles * 24; + } + for (int m = 5; m < 6; m++) + { + __m128i _out0tm0 = __lsx_vld(output0_tm_0, 0); + __m128i _out0tm1 = __lsx_vld(output0_tm_1, 0); + __m128i _out0tm2 = __lsx_vld(output0_tm_2, 0); + __m128i _out0tm3 = __lsx_vld(output0_tm_3, 0); + __m128i _out0tm4 = __lsx_vld(output0_tm_4, 0); + __m128i _out0tm5 = __lsx_vld(output0_tm_5, 0); + + __m128i _tmp02a = __lsx_vadd_w(_out0tm1, _out0tm2); + __m128i _tmp13a = __lsx_vsub_w(_out0tm1, _out0tm2); + + __m128i _tmp02b = __lsx_vadd_w(_out0tm3, _out0tm4); + __m128i _tmp13b = __lsx_vsub_w(_out0tm3, _out0tm4); + + __m128i _tmp0m = __lsx_vadd_w(__lsx_vadd_w(_out0tm0, _tmp02a), _tmp02b); + __m128i _tmp1m = __lsx_vadd_w(_tmp13a, __lsx_vslli_w(_tmp13b, 1)); + __m128i _tmp2m = __lsx_vadd_w(_tmp02a, __lsx_vslli_w(_tmp02b, 2)); + __m128i _tmp3m = __lsx_vadd_w(__lsx_vadd_w(_tmp13a, __lsx_vslli_w(_out0tm5, 2)), __lsx_vslli_w(_tmp13b, 3)); + + _tmp0m = __lsx_vslli_w(_tmp0m, 2); + _tmp1m = __lsx_vslli_w(_tmp1m, 2); + 
_tmp2m = __lsx_vslli_w(_tmp2m, 2); + _tmp3m = __lsx_vslli_w(_tmp3m, 2); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + + output0_tm_0 += tiles * 24; + output0_tm_1 += tiles * 24; + output0_tm_2 += tiles * 24; + output0_tm_3 += tiles * 24; + output0_tm_4 += tiles * 24; + output0_tm_5 += tiles * 24; + } + + for (int m = 0; m < 4; m++) + { + __m128i _tmp00 = __lsx_vld(tmp[m][0], 0); + __m128i _tmp01 = __lsx_vld(tmp[m][1], 0); + __m128i _tmp02 = __lsx_vld(tmp[m][2], 0); + __m128i _tmp03 = __lsx_vld(tmp[m][3], 0); + __m128i _tmp04 = __lsx_vld(tmp[m][4], 0); + __m128i _tmp05 = __lsx_vld(tmp[m][5], 0); + + __m128i _tmp02a = __lsx_vadd_w(_tmp01, _tmp02); + __m128i _tmp13a = __lsx_vsub_w(_tmp01, _tmp02); + + __m128i _tmp02b = __lsx_vadd_w(_tmp03, _tmp04); + __m128i _tmp13b = __lsx_vsub_w(_tmp03, _tmp04); + + __m128i _out00 = __lsx_vadd_w(__lsx_vadd_w(_tmp00, _tmp02a), _tmp02b); + __m128i _out01 = __lsx_vadd_w(_tmp13a, __lsx_vslli_w(_tmp13b, 1)); + __m128i _out02 = __lsx_vadd_w(_tmp02a, __lsx_vslli_w(_tmp02b, 2)); + __m128i _out03 = __lsx_vadd_w(__lsx_vadd_w(_tmp05, _tmp13a), __lsx_vslli_w(_tmp13b, 3)); + + // TODO use integer trick for division by 576 + __m128 _v576 = __lsx_vreplfr2vr_s(1.0 / 576); + _out00 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out00), _v576)); + _out01 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out01), _v576)); + _out02 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out02), _v576)); + _out03 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out03), _v576)); + + __lsx_vst(_out00, output0, 0); + __lsx_vst(_out01, output0 + 4, 0); + __lsx_vst(_out02, output0 + 8, 0); + __lsx_vst(_out03, output0 + 12, 0); + + output0 += outw * 4; + } + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform_pack8_int8.h b/src/layer/loongarch/convolution_winograd_transform_pack8_int8.h new file mode 100644 index 000000000000..5e49a87669a6 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform_pack8_int8.h @@ -0,0 +1,132 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
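
The pack4 int8 output transform above is the integer twin of the float version: the * 2, * 4 and * 8 factors become left shifts (__lsx_vslli_w by 1, 2 and 3), the sixth transform channel is pre-scaled by 4 (and the whole m == 5 column by another 4), and the final normalisation divides by 576 = 24 * 24 through a float multiply with 1/576 before converting back to int32, as the TODO comment notes. The scalar sketch below only illustrates the shift-for-multiply step on the same tmp02/tmp13 grouping; it sticks to non-negative intermediates, whereas the vector instruction shifts each signed 32-bit lane directly.

#include <cassert>

int main()
{
    const int r[6] = {10, 7, 3, 5, 2, 8};

    const int tmp02a = r[1] + r[2], tmp13a = r[1] - r[2];
    const int tmp02b = r[3] + r[4], tmp13b = r[3] - r[4];

    // multiply form, as in the float otm comments
    const int out_mul[4] = {
        r[0] + tmp02a + tmp02b,
        tmp13a + tmp13b * 2,
        tmp02a + tmp02b * 4,
        r[5] + tmp13a + tmp13b * 8,
    };

    // shift form, as in the int32 code above
    const int out_shift[4] = {
        r[0] + tmp02a + tmp02b,
        tmp13a + (tmp13b << 1),
        tmp02a + (tmp02b << 2),
        r[5] + tmp13a + (tmp13b << 3),
    };

    for (int i = 0; i < 4; i++)
        assert(out_mul[i] == out_shift[i]);
    return 0;
}
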
+ +static void conv3x3s1_winograd43_transform_input_pack8_int8_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 4; + const int h_tiles = (h - 2) / 4; + const int tiles = w_tiles * h_tiles; + + // const float itm[6][6] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + short tmp[6][6][8]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const signed char* r0 = img0.row(i * 4) + (j * 4) * 8; + + for (int m = 0; m < 6; m++) + { + __m128i _r00_01 = __lsx_vld(r0, 0); + __m128i _r02_03 = __lsx_vld(r0 + 16, 0); + __m128i _r04_05 = __lsx_vld(r0 + 32, 0); + __m128i _extr0001 = __lsx_vslti_b(_r00_01, 0); + __m128i _extr0203 = __lsx_vslti_b(_r02_03, 0); + __m128i _extr0405 = __lsx_vslti_b(_r04_05, 0); + __m128i _r00 = __lsx_vilvl_b(_extr0001, _r00_01); + __m128i _r01 = __lsx_vilvh_b(_extr0001, _r00_01); + __m128i _r02 = __lsx_vilvl_b(_extr0203, _r02_03); + __m128i _r03 = __lsx_vilvh_b(_extr0203, _r02_03); + __m128i _r04 = __lsx_vilvl_b(_extr0405, _r04_05); + __m128i _r05 = __lsx_vilvh_b(_extr0405, _r04_05); + + __m128i _v5 = __lsx_vreplgr2vr_h(5); + + __m128i _tmp0m = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_r00, 2), _r04), __lsx_vmul_h(_r02, _v5)); + __m128i _tmp1m = __lsx_vsub_h(__lsx_vadd_h(_r04, _r03), __lsx_vslli_h(__lsx_vadd_h(_r01, _r02), 2)); + __m128i _tmp2m = __lsx_vadd_h(__lsx_vsub_h(_r04, _r03), __lsx_vslli_h(__lsx_vsub_h(_r01, _r02), 2)); + __m128i _tmp3m = __lsx_vsub_h(__lsx_vsub_h(_r04, _r02), __lsx_vslli_h(__lsx_vsub_h(_r01, _r03), 1)); + __m128i _tmp4m = __lsx_vadd_h(__lsx_vsub_h(_r04, _r02), __lsx_vslli_h(__lsx_vsub_h(_r01, _r03), 1)); + __m128i _tmp5m = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_r01, 2), _r05), __lsx_vmul_h(_r03, _v5)); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + __lsx_vst(_tmp4m, tmp[4][m], 0); + __lsx_vst(_tmp5m, tmp[5][m], 0); + + r0 += w * 8; + } + + short* r0_tm_0 = (short*)img0_tm + (i * w_tiles + j) * 8; + short* r0_tm_1 = r0_tm_0 + tiles * 8; + short* r0_tm_2 = r0_tm_0 + tiles * 16; + short* r0_tm_3 = r0_tm_0 + tiles * 24; + short* r0_tm_4 = r0_tm_0 + tiles * 32; + short* r0_tm_5 = r0_tm_0 + tiles * 40; + + for (int m = 0; m < 6; m++) + { + __m128i _tmp00 = __lsx_vld(tmp[m][0], 0); + __m128i _tmp01 = __lsx_vld(tmp[m][1], 0); + __m128i _tmp02 = __lsx_vld(tmp[m][2], 0); + __m128i _tmp03 = __lsx_vld(tmp[m][3], 0); + __m128i _tmp04 = __lsx_vld(tmp[m][4], 0); + __m128i _tmp05 = __lsx_vld(tmp[m][5], 0); + + __m128i _v5 = __lsx_vreplgr2vr_h(5); + + __m128i _r0tm0 = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_tmp00, 2), _tmp04), __lsx_vmul_h(_tmp02, _v5)); + __m128i _r0tm1 = __lsx_vsub_h(__lsx_vadd_h(_tmp04, _tmp03), __lsx_vslli_h(__lsx_vadd_h(_tmp01, _tmp02), 2)); 
+ __m128i _r0tm2 = __lsx_vadd_h(__lsx_vsub_h(_tmp04, _tmp03), __lsx_vslli_h(__lsx_vsub_h(_tmp01, _tmp02), 2)); + __m128i _r0tm3 = __lsx_vsub_h(__lsx_vsub_h(_tmp04, _tmp02), __lsx_vslli_h(__lsx_vsub_h(_tmp01, _tmp03), 1)); + __m128i _r0tm4 = __lsx_vadd_h(__lsx_vsub_h(_tmp04, _tmp02), __lsx_vslli_h(__lsx_vsub_h(_tmp01, _tmp03), 1)); + __m128i _r0tm5 = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_tmp01, 2), _tmp05), __lsx_vmul_h(_tmp03, _v5)); + + __lsx_vst(_r0tm0, r0_tm_0, 0); + __lsx_vst(_r0tm1, r0_tm_1, 0); + __lsx_vst(_r0tm2, r0_tm_2, 0); + __lsx_vst(_r0tm3, r0_tm_3, 0); + __lsx_vst(_r0tm4, r0_tm_4, 0); + __lsx_vst(_r0tm5, r0_tm_5, 0); + + r0_tm_0 += tiles * 48; + r0_tm_1 += tiles * 48; + r0_tm_2 += tiles * 48; + r0_tm_3 += tiles * 48; + r0_tm_4 += tiles * 48; + r0_tm_5 += tiles * 48; + } + } + } + } +} diff --git a/src/layer/loongarch/convolutiondepthwise_3x3.h b/src/layer/loongarch/convolutiondepthwise_3x3.h new file mode 100644 index 000000000000..1c37f7789f3b --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_3x3.h @@ -0,0 +1,193 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convdw3x3s1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const float* kernel = _kernel; + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + const float bias0 = bias ? 
bias[g] : 0.f; + + const float* kernel0 = kernel + g * 9; + + float* outptr0 = out; + float* outptr1 = outptr0 + outw; + + const float* img0 = bottom_blob.channel(g); + + const float* r0 = img0; + const float* r1 = img0 + w; + const float* r2 = img0 + w * 2; + const float* r3 = img0 + w * 3; + + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + int i = 0; + + for (; i + 1 < outh; i += 2) + { + for (int j = 0; j < outw; j++) + { + float sum = bias0; + float sum2 = bias0; + + sum += r0[0] * k0[0]; + sum += r0[1] * k0[1]; + sum += r0[2] * k0[2]; + sum2 += r1[0] * k0[0]; + sum2 += r1[1] * k0[1]; + sum2 += r1[2] * k0[2]; + sum += r1[0] * k1[0]; + sum += r1[1] * k1[1]; + sum += r1[2] * k1[2]; + sum2 += r2[0] * k1[0]; + sum2 += r2[1] * k1[1]; + sum2 += r2[2] * k1[2]; + sum += r2[0] * k2[0]; + sum += r2[1] * k2[1]; + sum += r2[2] * k2[2]; + sum2 += r3[0] * k2[0]; + sum2 += r3[1] * k2[1]; + sum2 += r3[2] * k2[2]; + + *outptr0 = sum; + *outptr1 = sum2; + + r0++; + r1++; + r2++; + r3++; + outptr0++; + outptr1++; + } + + r0 += 2 + w; + r1 += 2 + w; + r2 += 2 + w; + r3 += 2 + w; + + outptr0 += outw; + outptr1 += outw; + } + + for (; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = bias0; + sum += r0[0] * k0[0]; + sum += r0[1] * k0[1]; + sum += r0[2] * k0[2]; + sum += r1[0] * k1[0]; + sum += r1[1] * k1[1]; + sum += r1[2] * k1[2]; + sum += r2[0] * k2[0]; + sum += r2[1] * k2[1]; + sum += r2[2] * k2[2]; + + *outptr0 = sum; + + r0++; + r1++; + r2++; + outptr0++; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + } +} + +static void convdw3x3s2_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const int tailstep = w - 2 * outw + w; + + const float* kernel = _kernel; + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + const float bias0 = bias ? bias[g] : 0.f; + + const float* kernel0 = kernel + g * 9; + + float* outptr = out; + + const float* img0 = bottom_blob.channel(g); + + const float* r0 = img0; + const float* r1 = img0 + w; + const float* r2 = img0 + w * 2; + + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + int i = 0; + + for (; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = bias0; + sum += r0[0] * k0[0]; + sum += r0[1] * k0[1]; + sum += r0[2] * k0[2]; + sum += r1[0] * k1[0]; + sum += r1[1] * k1[1]; + sum += r1[2] * k1[2]; + sum += r2[0] * k2[0]; + sum += r2[1] * k2[1]; + sum += r2[2] * k2[2]; + + *outptr = sum; + + r0 += 2; + r1 += 2; + r2 += 2; + outptr++; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } +} diff --git a/src/layer/loongarch/convolutiondepthwise_3x3_pack4.h b/src/layer/loongarch/convolutiondepthwise_3x3_pack4.h new file mode 100644 index 000000000000..48ae66412fc1 --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_3x3_pack4.h @@ -0,0 +1,464 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convdw3x3s1_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + const float* k0 = kernel.row(g); + + float* outptr0 = out.row(0); + float* outptr1 = out.row(1); + + const Mat img0 = bottom_blob.channel(g); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + const float* r3 = img0.row(3); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0); + __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0); + + int i = 0; + for (; i + 1 < outh; i += 2) + { + int j = 0; + for (; j + 1 < outw; j += 2) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(r1 + 32); + __builtin_prefetch(r2 + 32); + __builtin_prefetch(r3 + 32); + + __m128 _sum00 = _bias0; + __m128 _sum01 = _bias0; + __m128 _sum10 = _bias0; + __m128 _sum11 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r00, _k00, _sum00); + _sum00 = __lsx_vfmadd_s(_r01, _k01, _sum00); + _sum00 = __lsx_vfmadd_s(_r02, _k02, _sum00); + _sum01 = __lsx_vfmadd_s(_r01, _k00, _sum01); + _sum01 = __lsx_vfmadd_s(_r02, _k01, _sum01); + _sum01 = __lsx_vfmadd_s(_r03, _k02, _sum01); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r10, _k10, _sum00); + _sum00 = __lsx_vfmadd_s(_r11, _k11, _sum00); + _sum00 = __lsx_vfmadd_s(_r12, _k12, _sum00); + _sum01 = __lsx_vfmadd_s(_r11, _k10, _sum01); + _sum01 = __lsx_vfmadd_s(_r12, _k11, _sum01); + _sum01 = __lsx_vfmadd_s(_r13, _k12, _sum01); + _sum10 = __lsx_vfmadd_s(_r10, _k00, _sum10); + _sum10 = __lsx_vfmadd_s(_r11, _k01, _sum10); + _sum10 = __lsx_vfmadd_s(_r12, _k02, _sum10); + _sum11 = __lsx_vfmadd_s(_r11, _k00, _sum11); + _sum11 = __lsx_vfmadd_s(_r12, _k01, _sum11); + _sum11 = __lsx_vfmadd_s(_r13, _k02, _sum11); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r20, _k20, _sum00); + _sum00 = __lsx_vfmadd_s(_r21, _k21, _sum00); + _sum00 = __lsx_vfmadd_s(_r22, _k22, _sum00); + _sum01 = 
__lsx_vfmadd_s(_r21, _k20, _sum01); + _sum01 = __lsx_vfmadd_s(_r22, _k21, _sum01); + _sum01 = __lsx_vfmadd_s(_r23, _k22, _sum01); + _sum10 = __lsx_vfmadd_s(_r20, _k10, _sum10); + _sum10 = __lsx_vfmadd_s(_r21, _k11, _sum10); + _sum10 = __lsx_vfmadd_s(_r22, _k12, _sum10); + _sum11 = __lsx_vfmadd_s(_r21, _k10, _sum11); + _sum11 = __lsx_vfmadd_s(_r22, _k11, _sum11); + _sum11 = __lsx_vfmadd_s(_r23, _k12, _sum11); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0); + + _sum10 = __lsx_vfmadd_s(_r30, _k20, _sum10); + _sum10 = __lsx_vfmadd_s(_r31, _k21, _sum10); + _sum10 = __lsx_vfmadd_s(_r32, _k22, _sum10); + _sum11 = __lsx_vfmadd_s(_r31, _k20, _sum11); + _sum11 = __lsx_vfmadd_s(_r32, _k21, _sum11); + _sum11 = __lsx_vfmadd_s(_r33, _k22, _sum11); + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum01, outptr0 + 4, 0); + __lsx_vst(_sum10, outptr1, 0); + __lsx_vst(_sum11, outptr1 + 4, 0); + + outptr0 += 4 * 2; + outptr1 += 4 * 2; + + r0 += 4 * 2; + r1 += 4 * 2; + r2 += 4 * 2; + r3 += 4 * 2; + } + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(r1 + 16); + __builtin_prefetch(r2 + 16); + __builtin_prefetch(r3 + 16); + + __m128 _sum0 = _bias0; + __m128 _sum1 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + _sum1 = __lsx_vfmadd_s(_r10, _k00, _sum1); + _sum1 = __lsx_vfmadd_s(_r11, _k01, _sum1); + _sum1 = __lsx_vfmadd_s(_r12, _k02, _sum1); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + _sum1 = __lsx_vfmadd_s(_r20, _k10, _sum1); + _sum1 = __lsx_vfmadd_s(_r21, _k11, _sum1); + _sum1 = __lsx_vfmadd_s(_r22, _k12, _sum1); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + + _sum1 = __lsx_vfmadd_s(_r30, _k20, _sum1); + _sum1 = __lsx_vfmadd_s(_r31, _k21, _sum1); + _sum1 = __lsx_vfmadd_s(_r32, _k22, _sum1); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + + outptr0 += 4; + outptr1 += 4; + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + } + + r0 += 2 * 4 + w * 4; + r1 += 2 * 4 + w * 4; + r2 += 2 * 4 + w * 4; + r3 += 2 * 4 + w * 4; + + outptr0 += outw * 4; + outptr1 += outw * 4; + } + for (; i < outh; i++) + { + int j = 0; + for (; j + 1 < outw; j += 2) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(r1 + 32); + __builtin_prefetch(r2 + 32); + + __m128 _sum00 = _bias0; + __m128 _sum01 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r00, _k00, _sum00); + _sum00 = __lsx_vfmadd_s(_r01, _k01, _sum00); + _sum00 = __lsx_vfmadd_s(_r02, 
_k02, _sum00); + _sum01 = __lsx_vfmadd_s(_r01, _k00, _sum01); + _sum01 = __lsx_vfmadd_s(_r02, _k01, _sum01); + _sum01 = __lsx_vfmadd_s(_r03, _k02, _sum01); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r10, _k10, _sum00); + _sum00 = __lsx_vfmadd_s(_r11, _k11, _sum00); + _sum00 = __lsx_vfmadd_s(_r12, _k12, _sum00); + _sum01 = __lsx_vfmadd_s(_r11, _k10, _sum01); + _sum01 = __lsx_vfmadd_s(_r12, _k11, _sum01); + _sum01 = __lsx_vfmadd_s(_r13, _k12, _sum01); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r20, _k20, _sum00); + _sum00 = __lsx_vfmadd_s(_r21, _k21, _sum00); + _sum00 = __lsx_vfmadd_s(_r22, _k22, _sum00); + _sum01 = __lsx_vfmadd_s(_r21, _k20, _sum01); + _sum01 = __lsx_vfmadd_s(_r22, _k21, _sum01); + _sum01 = __lsx_vfmadd_s(_r23, _k22, _sum01); + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum01, outptr0 + 4, 0); + + outptr0 += 4 * 2; + + r0 += 4 * 2; + r1 += 4 * 2; + r2 += 4 * 2; + } + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(r1 + 16); + __builtin_prefetch(r2 + 16); + + __m128 _sum0 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 4; + r1 += 4; + r2 += 4; + } + + r0 += 2 * 4; + r1 += 2 * 4; + r2 += 2 * 4; + } + } +} + +static void convdw3x3s2_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const int tailstep = (w - 2 * outw + w) * 4; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + __m128 _bias0 = bias ? 
(__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + const float* k0 = kernel.row(g); + + float* outptr0 = out; + + const Mat img0 = bottom_blob.channel(g); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0); + __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j + 1 < outw; j += 2) + { + __builtin_prefetch(r0 + 64); + __builtin_prefetch(r1 + 64); + __builtin_prefetch(r2 + 64); + + __m128 _sum00 = _bias0; + __m128 _sum01 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + + _sum00 = __lsx_vfmadd_s(_r00, _k00, _sum00); + _sum00 = __lsx_vfmadd_s(_r01, _k01, _sum00); + _sum00 = __lsx_vfmadd_s(_r02, _k02, _sum00); + _sum01 = __lsx_vfmadd_s(_r02, _k00, _sum01); + _sum01 = __lsx_vfmadd_s(_r03, _k01, _sum01); + _sum01 = __lsx_vfmadd_s(_r04, _k02, _sum01); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0); + + _sum00 = __lsx_vfmadd_s(_r10, _k10, _sum00); + _sum00 = __lsx_vfmadd_s(_r11, _k11, _sum00); + _sum00 = __lsx_vfmadd_s(_r12, _k12, _sum00); + _sum01 = __lsx_vfmadd_s(_r12, _k10, _sum01); + _sum01 = __lsx_vfmadd_s(_r13, _k11, _sum01); + _sum01 = __lsx_vfmadd_s(_r14, _k12, _sum01); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0); + + _sum00 = __lsx_vfmadd_s(_r20, _k20, _sum00); + _sum00 = __lsx_vfmadd_s(_r21, _k21, _sum00); + _sum00 = __lsx_vfmadd_s(_r22, _k22, _sum00); + _sum01 = __lsx_vfmadd_s(_r22, _k20, _sum01); + _sum01 = __lsx_vfmadd_s(_r23, _k21, _sum01); + _sum01 = __lsx_vfmadd_s(_r24, _k22, _sum01); + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum01, outptr0 + 4, 0); + + outptr0 += 4 * 2; + + r0 += 4 * 4; + r1 += 4 * 4; + r2 += 4 * 4; + } + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(r1 + 32); + __builtin_prefetch(r2 + 32); + + __m128 _sum0 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r20, _k20, 
_sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 4 * 2; + r1 += 4 * 2; + r2 += 4 * 2; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } +} diff --git a/src/layer/loongarch/convolutiondepthwise_5x5_pack4.h b/src/layer/loongarch/convolutiondepthwise_5x5_pack4.h new file mode 100644 index 000000000000..4f94c5e69958 --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_5x5_pack4.h @@ -0,0 +1,511 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convdw5x5s1_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + const float* k0 = kernel.row(g); + + float* outptr0 = out.row(0); + float* outptr1 = out.row(1); + + const Mat img0 = bottom_blob.channel(g); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + const float* r3 = img0.row(3); + const float* r4 = img0.row(4); + const float* r5 = img0.row(5); + + int i = 0; + for (; i + 1 < outh; i += 2) + { + int j = 0; + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(r1 + 16); + __builtin_prefetch(r2 + 16); + __builtin_prefetch(r3 + 16); + __builtin_prefetch(r4 + 16); + __builtin_prefetch(r5 + 16); + + __builtin_prefetch(k0 + 400); + + __m128 _sum0 = _bias0; + __m128 _sum1 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + _sum0 = __lsx_vfmadd_s(_r03, _k03, _sum0); + _sum0 = __lsx_vfmadd_s(_r04, _k04, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r10, _k00, _sum1); + _sum1 = __lsx_vfmadd_s(_r11, _k01, _sum1); + _sum1 = __lsx_vfmadd_s(_r12, _k02, _sum1); + _sum1 = 
__lsx_vfmadd_s(_r13, _k03, _sum1); + _sum1 = __lsx_vfmadd_s(_r14, _k04, _sum1); + + __m128 _k10 = (__m128)__lsx_vld(k0, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k14 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + _sum0 = __lsx_vfmadd_s(_r13, _k13, _sum0); + _sum0 = __lsx_vfmadd_s(_r14, _k14, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r20, _k10, _sum1); + _sum1 = __lsx_vfmadd_s(_r21, _k11, _sum1); + _sum1 = __lsx_vfmadd_s(_r22, _k12, _sum1); + _sum1 = __lsx_vfmadd_s(_r23, _k13, _sum1); + _sum1 = __lsx_vfmadd_s(_r24, _k14, _sum1); + + __m128 _k20 = (__m128)__lsx_vld(k0, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + _sum0 = __lsx_vfmadd_s(_r23, _k23, _sum0); + _sum0 = __lsx_vfmadd_s(_r24, _k24, _sum0); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0); + __m128 _r34 = (__m128)__lsx_vld(r3 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r30, _k20, _sum1); + _sum1 = __lsx_vfmadd_s(_r31, _k21, _sum1); + _sum1 = __lsx_vfmadd_s(_r32, _k22, _sum1); + _sum1 = __lsx_vfmadd_s(_r33, _k23, _sum1); + _sum1 = __lsx_vfmadd_s(_r34, _k24, _sum1); + + __m128 _k30 = (__m128)__lsx_vld(k0, 0); + __m128 _k31 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k33 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r30, _k30, _sum0); + _sum0 = __lsx_vfmadd_s(_r31, _k31, _sum0); + _sum0 = __lsx_vfmadd_s(_r32, _k32, _sum0); + _sum0 = __lsx_vfmadd_s(_r33, _k33, _sum0); + _sum0 = __lsx_vfmadd_s(_r34, _k34, _sum0); + + __m128 _r40 = (__m128)__lsx_vld(r4, 0); + __m128 _r41 = (__m128)__lsx_vld(r4 + 4, 0); + __m128 _r42 = (__m128)__lsx_vld(r4 + 4 * 2, 0); + __m128 _r43 = (__m128)__lsx_vld(r4 + 4 * 3, 0); + __m128 _r44 = (__m128)__lsx_vld(r4 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r40, _k30, _sum1); + _sum1 = __lsx_vfmadd_s(_r41, _k31, _sum1); + _sum1 = __lsx_vfmadd_s(_r42, _k32, _sum1); + _sum1 = __lsx_vfmadd_s(_r43, _k33, _sum1); + _sum1 = __lsx_vfmadd_s(_r44, _k34, _sum1); + + __m128 _k40 = (__m128)__lsx_vld(k0, 0); + __m128 _k41 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 -= 4 * 20; + + _sum0 = __lsx_vfmadd_s(_r40, _k40, _sum0); + _sum0 = __lsx_vfmadd_s(_r41, _k41, _sum0); + _sum0 = __lsx_vfmadd_s(_r42, _k42, _sum0); + _sum0 = __lsx_vfmadd_s(_r43, _k43, _sum0); + _sum0 = __lsx_vfmadd_s(_r44, _k44, _sum0); + + __m128 _r50 = (__m128)__lsx_vld(r5, 0); + __m128 _r51 = (__m128)__lsx_vld(r5 + 4, 0); + __m128 _r52 = (__m128)__lsx_vld(r5 + 4 * 2, 0); + 
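+                    // r5 is the sixth input row: since two output rows are produced per
+                    // iteration, it only contributes to _sum1 (the lower output row) through
+                    // the last kernel row _k40.._k44; _sum0 is already fully accumulated here.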
__m128 _r53 = (__m128)__lsx_vld(r5 + 4 * 3, 0); + __m128 _r54 = (__m128)__lsx_vld(r5 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r50, _k40, _sum1); + _sum1 = __lsx_vfmadd_s(_r51, _k41, _sum1); + _sum1 = __lsx_vfmadd_s(_r52, _k42, _sum1); + _sum1 = __lsx_vfmadd_s(_r53, _k43, _sum1); + _sum1 = __lsx_vfmadd_s(_r54, _k44, _sum1); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + + outptr0 += 4; + outptr1 += 4; + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + r4 += 4; + r5 += 4; + } + + r0 += 4 * 4 + w * 4; + r1 += 4 * 4 + w * 4; + r2 += 4 * 4 + w * 4; + r3 += 4 * 4 + w * 4; + r4 += 4 * 4 + w * 4; + r5 += 4 * 4 + w * 4; + + outptr0 += outw * 4; + outptr1 += outw * 4; + } + for (; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(r1 + 16); + __builtin_prefetch(r2 + 16); + __builtin_prefetch(r3 + 16); + __builtin_prefetch(r4 + 16); + + __builtin_prefetch(k0 + 400); + + __m128 _sum0 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + _sum0 = __lsx_vfmadd_s(_r03, _k03, _sum0); + _sum0 = __lsx_vfmadd_s(_r04, _k04, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0); + + __m128 _k10 = (__m128)__lsx_vld(k0, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k14 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + _sum0 = __lsx_vfmadd_s(_r13, _k13, _sum0); + _sum0 = __lsx_vfmadd_s(_r14, _k14, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0); + + __m128 _k20 = (__m128)__lsx_vld(k0, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + _sum0 = __lsx_vfmadd_s(_r23, _k23, _sum0); + _sum0 = __lsx_vfmadd_s(_r24, _k24, _sum0); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0); + __m128 _r34 = (__m128)__lsx_vld(r3 + 4 * 4, 0); + + __m128 _k30 = (__m128)__lsx_vld(k0, 0); + __m128 _k31 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k33 = 
(__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r30, _k30, _sum0); + _sum0 = __lsx_vfmadd_s(_r31, _k31, _sum0); + _sum0 = __lsx_vfmadd_s(_r32, _k32, _sum0); + _sum0 = __lsx_vfmadd_s(_r33, _k33, _sum0); + _sum0 = __lsx_vfmadd_s(_r34, _k34, _sum0); + + __m128 _r40 = (__m128)__lsx_vld(r4, 0); + __m128 _r41 = (__m128)__lsx_vld(r4 + 4, 0); + __m128 _r42 = (__m128)__lsx_vld(r4 + 4 * 2, 0); + __m128 _r43 = (__m128)__lsx_vld(r4 + 4 * 3, 0); + __m128 _r44 = (__m128)__lsx_vld(r4 + 4 * 4, 0); + + __m128 _k40 = (__m128)__lsx_vld(k0, 0); + __m128 _k41 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 -= 4 * 20; + + _sum0 = __lsx_vfmadd_s(_r40, _k40, _sum0); + _sum0 = __lsx_vfmadd_s(_r41, _k41, _sum0); + _sum0 = __lsx_vfmadd_s(_r42, _k42, _sum0); + _sum0 = __lsx_vfmadd_s(_r43, _k43, _sum0); + _sum0 = __lsx_vfmadd_s(_r44, _k44, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + r4 += 4; + } + + r0 += 4 * 4; + r1 += 4 * 4; + r2 += 4 * 4; + r3 += 4 * 4; + r4 += 4 * 4; + } + } +} + +static void convdw5x5s2_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const int tailstep = (w - 2 * outw + w) * 4; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + const float* k0 = kernel.row(g); + + float* outptr0 = out; + + const Mat img0 = bottom_blob.channel(g); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + const float* r3 = img0.row(3); + const float* r4 = img0.row(4); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(r1 + 32); + __builtin_prefetch(r2 + 32); + __builtin_prefetch(r3 + 32); + __builtin_prefetch(r4 + 32); + + __builtin_prefetch(k0 + 400); + + __m128 _sum0 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + _sum0 = __lsx_vfmadd_s(_r03, _k03, _sum0); + _sum0 = __lsx_vfmadd_s(_r04, _k04, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0); + + __m128 _k10 = (__m128)__lsx_vld(k0, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k14 = 
(__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + _sum0 = __lsx_vfmadd_s(_r13, _k13, _sum0); + _sum0 = __lsx_vfmadd_s(_r14, _k14, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0); + + __m128 _k20 = (__m128)__lsx_vld(k0, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + _sum0 = __lsx_vfmadd_s(_r23, _k23, _sum0); + _sum0 = __lsx_vfmadd_s(_r24, _k24, _sum0); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0); + __m128 _r34 = (__m128)__lsx_vld(r3 + 4 * 4, 0); + + __m128 _k30 = (__m128)__lsx_vld(k0, 0); + __m128 _k31 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k33 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r30, _k30, _sum0); + _sum0 = __lsx_vfmadd_s(_r31, _k31, _sum0); + _sum0 = __lsx_vfmadd_s(_r32, _k32, _sum0); + _sum0 = __lsx_vfmadd_s(_r33, _k33, _sum0); + _sum0 = __lsx_vfmadd_s(_r34, _k34, _sum0); + + __m128 _r40 = (__m128)__lsx_vld(r4, 0); + __m128 _r41 = (__m128)__lsx_vld(r4 + 4, 0); + __m128 _r42 = (__m128)__lsx_vld(r4 + 4 * 2, 0); + __m128 _r43 = (__m128)__lsx_vld(r4 + 4 * 3, 0); + __m128 _r44 = (__m128)__lsx_vld(r4 + 4 * 4, 0); + + __m128 _k40 = (__m128)__lsx_vld(k0, 0); + __m128 _k41 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 -= 4 * 20; + + _sum0 = __lsx_vfmadd_s(_r40, _k40, _sum0); + _sum0 = __lsx_vfmadd_s(_r41, _k41, _sum0); + _sum0 = __lsx_vfmadd_s(_r42, _k42, _sum0); + _sum0 = __lsx_vfmadd_s(_r43, _k43, _sum0); + _sum0 = __lsx_vfmadd_s(_r44, _k44, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 4 * 2; + r1 += 4 * 2; + r2 += 4 * 2; + r3 += 4 * 2; + r4 += 4 * 2; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + r3 += tailstep; + r4 += tailstep; + } + } +} diff --git a/src/layer/loongarch/convolutiondepthwise_loongarch.cpp b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp new file mode 100644 index 000000000000..4d134cc4a39a --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp @@ -0,0 +1,966 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolutiondepthwise_loongarch.h" + +#include "layer_type.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +#include "convolutiondepthwise_3x3.h" + +#if __loongarch_sx +#include "convolutiondepthwise_3x3_pack4.h" +#include "convolutiondepthwise_5x5_pack4.h" +#endif // __loongarch_sx + +ConvolutionDepthWise_loongarch::ConvolutionDepthWise_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx + + activation = 0; +} + +int ConvolutionDepthWise_loongarch::create_pipeline(const Option& opt) +{ + if (dynamic_weight) + return 0; + + activation = create_activation_layer(activation_type, activation_params, opt); + +#if NCNN_INT8 + if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + { + return create_pipeline_int8_loongarch(opt); + } +#endif + + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels == group && group == num_output) + { + int elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = channels % 4 == 0 ? 4 : 1; + } +#endif + +#if __loongarch_sx + // pack4 + if (elempack == 4) + { + Mat weight_data_r2 = weight_data.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, 4, opt); + } +#endif // __loongarch_sx + + if (elempack == 1) + { + weight_data_tm = weight_data; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; + } + + // group convolution + create_group_ops(opt); + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int ConvolutionDepthWise_loongarch::create_group_ops(const Option& opt) +{ + // create Convolution op for each group + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + for (int i = 0; i < (int)group_ops.size(); i++) + delete group_ops[i]; + + group_ops.clear(); + + const int channels_g = channels / group; + const int num_output_g = num_output / group; + + group_ops.resize(group); + + for (int g = 0; g < group; g++) + { + Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone(); + Mat bias_data_g; + if (bias_term) + bias_data_g = bias_data.range(num_output_g * g, num_output_g); + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); + + // set param + ncnn::ParamDict pd; + pd.set(0, num_output_g); // num_output + pd.set(1, kernel_w); + pd.set(11, kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, 0); // pad_w + pd.set(14, 0); // pad_h + pd.set(5, bias_term); + pd.set(6, maxk * channels_g * num_output_g); // weight_data_size + pd.set(8, int8_scale_term); + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + // set weights + if (bias_term) + { + ncnn::Mat weights[5]; + weights[0] = weight_data_g; + weights[1] = bias_data_g; + +#if NCNN_INT8 + if (int8_scale_term) + { + Mat weight_data_int8_scales_g(num_output_g); + weight_data_int8_scales_g.fill(weight_data_int8_scales[g]); + weights[2] = weight_data_int8_scales_g; + weights[3] = bottom_blob_int8_scales.range(g, 1); + } + if (int8_scale_term > 100) + { + weights[4] = top_blob_int8_scales.range(g, 1); + } +#endif + + 
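+            // The per-group Convolution op reads these mats in order through
+            // ModelBinFromMatArray: [0] group weight, [1] group bias, then with
+            // int8 scales [2] weight scales, [3] input scale and, when
+            // requantizing (int8_scale_term > 100), [4] output scale.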
op->load_model(ModelBinFromMatArray(weights)); + } + else + { + ncnn::Mat weights[4]; + weights[0] = weight_data_g; + +#if NCNN_INT8 + if (int8_scale_term) + { + Mat weight_data_int8_scales_g(num_output_g); + weight_data_int8_scales_g.fill(weight_data_int8_scales[g]); + weights[1] = weight_data_int8_scales_g; + weights[2] = bottom_blob_int8_scales.range(g, 1); + } + if (int8_scale_term > 100) + { + weights[3] = top_blob_int8_scales.range(g, 1); + } +#endif + + op->load_model(ModelBinFromMatArray(weights)); + } + + op->create_pipeline(opt); + + group_ops[g] = op; + } + + return 0; +} + +int ConvolutionDepthWise_loongarch::destroy_pipeline(const Option& opt) +{ + if (activation) + { + activation->destroy_pipeline(opt); + delete activation; + activation = 0; + } + + for (int i = 0; i < (int)group_ops.size(); i++) + { + group_ops[i]->destroy_pipeline(opt); + delete group_ops[i]; + } + group_ops.clear(); + + return 0; +} + +int ConvolutionDepthWise_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + return forward_int8_loongarch(bottom_blob, top_blob, opt); + } +#endif + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // depth-wise + if (channels * elempack == group && group == num_output) + { +#if __loongarch_sx + if (elempack == 4) + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convdw3x3s1_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convdw3x3s2_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convdw5x5s1_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convdw5x5s2_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + float* outptr = top_blob.channel(g); + const float* kptr = (const float*)weight_data_tm + maxk * g * 4; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = (__m128)__lsx_vld((const float*)bias_data + g * 4, 0); + } + + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0); + __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convdw3x3s1_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convdw3x3s2_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * 
dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + float* outptr = top_blob.channel(g); + const float* kptr = (const float*)weight_data_tm + maxk * g; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[g]; + + const float* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + float val = (float)sptr[space_ofs[k]]; + float w = (float)kptr[k]; + sum += val * w; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } + } + } + + return 0; + } + + // group convolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + g_elempack = channels_g % 4 == 0 ? 4 : 1; + out_g_elempack = num_output_g % 4 == 0 ? 4 : 1; + } +#endif + + // unpacking + Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p); + } + + Mat top_blob_unpacked = top_blob; + if (out_g_elempack < out_elempack) + { + top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_unpacked.allocator; + + // forward + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + else + { + top_blob = top_blob_unpacked; + } + + return 0; +} + +int ConvolutionDepthWise_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& _weight_data = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + const int _kernel_w = _weight_data.w; + const int _kernel_h = _weight_data.h; + const int _num_output = _weight_data.c * _weight_data.elempack; + + Mat weight_data_flattened; + flatten(_weight_data, weight_data_flattened, opt); + if (weight_data_flattened.empty()) + return -100; + + // weight_data_flattened as pack1 + weight_data_flattened.w *= weight_data_flattened.elempack; + weight_data_flattened.elemsize /= weight_data_flattened.elempack; + weight_data_flattened.elempack = 1; + + Mat bias_data_flattened; + if (bias_term) + { + const Mat& _bias_data = bottom_blobs[2]; + flatten(_bias_data, bias_data_flattened, opt); + if (bias_data_flattened.empty()) + return -100; + + // bias_data_flattened as pack1 + bias_data_flattened.w *= bias_data_flattened.elempack; + bias_data_flattened.elemsize /= bias_data_flattened.elempack; + 
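+        // Reinterpret the flattened bias in place as elempack=1: w was widened and
+        // elemsize shrunk above, and elempack is reset just below, mirroring the
+        // treatment of weight_data_flattened.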
bias_data_flattened.elempack = 1; + } + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise); + + ncnn::ParamDict pd; + pd.set(0, _num_output); + pd.set(1, _kernel_w); + pd.set(11, _kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, pad_left); + pd.set(15, pad_right); + pd.set(14, pad_top); + pd.set(16, pad_bottom); + pd.set(18, pad_value); + pd.set(5, bias_term); + pd.set(6, weight_data_flattened.w); + pd.set(7, group); + pd.set(8, int8_scale_term); + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + ncnn::Mat weights[2]; + weights[0] = weight_data_flattened; + weights[1] = bias_data_flattened; + + op->load_model(ncnn::ModelBinFromMatArray(weights)); + + op->create_pipeline(opt); + + op->forward(bottom_blob, top_blob, opt); + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + +#if NCNN_INT8 +int ConvolutionDepthWise_loongarch::create_pipeline_int8_loongarch(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels == group && group == num_output) + { + int elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = channels % 8 == 0 ? 8 : 1; + } +#endif // __loongarch_sx + + if (elempack == 8) + { + Mat weight_data_r2 = weight_data.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, 8, opt); + } + + if (elempack == 1) + { + weight_data_tm = weight_data; + } + + return 0; + } + + // group convolution + create_group_ops(opt); + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int ConvolutionDepthWise_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int elempack = bottom_blob.elempack; + + int elembits = bottom_blob.elembits(); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + const int channels_g = channels * elempack / group; + + Mat scales(channels * elempack); + { + float* ps = scales; + for (int g = 0; g < group; g++) + { + float scale = bottom_blob_int8_scales[g]; + for (int q = 0; q < channels_g; q++) + { + *ps++ = scale; + } + } + } + + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q); + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + channels = bottom_blob_bordered.c; + elempack = bottom_blob_bordered.elempack; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + // depth-wise + if (channels * elempack == group && group == num_output) + { + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif // __loongarch_sx + bool use_int8_requantize = int8_scale_term > 100; + size_t out_elemsize = use_int8_requantize ? 
1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 8) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g * 8; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vld(sptr + space_ofs[k] * 8, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr + k * 8, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + } + + __m128 _scale_in0; + __m128 _scale_in1; + { + __m128 _bottom_blob_int8_scales0 = (__m128)__lsx_vld((const float*)bottom_blob_int8_scales + g * 8, 0); + __m128 _bottom_blob_int8_scales1 = (__m128)__lsx_vld((const float*)bottom_blob_int8_scales + g * 8 + 4, 0); + __m128 _weight_data_int8_scales0 = (__m128)__lsx_vld((const float*)weight_data_int8_scales + g * 8, 0); + __m128 _weight_data_int8_scales1 = (__m128)__lsx_vld((const float*)weight_data_int8_scales + g * 8 + 4, 0); + _scale_in0 = __lsx_vfrecip_s(__lsx_vfmul_s(_bottom_blob_int8_scales0, _weight_data_int8_scales0)); + _scale_in1 = __lsx_vfrecip_s(__lsx_vfmul_s(_bottom_blob_int8_scales1, _weight_data_int8_scales1)); + + __m128i _m0 = __lsx_vfcmp_cne_s(_weight_data_int8_scales0, __lsx_vreplfr2vr_s(0.f)); + __m128i _m1 = __lsx_vfcmp_cne_s(_weight_data_int8_scales1, __lsx_vreplfr2vr_s(0.f)); + _scale_in0 = (__m128)__lsx_vand_v((__m128i)_scale_in0, (__m128i)_m0); + _scale_in1 = (__m128)__lsx_vand_v((__m128i)_scale_in1, (__m128i)_m1); + } + + __m128 _sumfp32_0 = __lsx_vfmul_s(__lsx_vffint_s_w(_sum0), _scale_in0); + __m128 _sumfp32_1 = __lsx_vfmul_s(__lsx_vffint_s_w(_sum1), _scale_in1); + + if (bias_term) + { + __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + g * 8, 0); + __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + g * 8 + 4, 0); + _sumfp32_0 = __lsx_vfadd_s(_sumfp32_0, _bias0); + _sumfp32_1 = __lsx_vfadd_s(_sumfp32_1, _bias1); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params); + + if (use_int8_requantize) + { + // requantize and relu + __m128 _scale_out0 = (__m128)__lsx_vld((const float*)top_blob_int8_scales + g * 8, 0); + __m128 _scale_out1 = (__m128)__lsx_vld((const float*)top_blob_int8_scales + g * 8 + 4, 0); + _sumfp32_0 = __lsx_vfmul_s(_sumfp32_0, 
_scale_out0); + _sumfp32_1 = __lsx_vfmul_s(_sumfp32_1, _scale_out1); + int64_t _sum8 = float2int8(_sumfp32_0, _sumfp32_1); + + *(int64_t*)outptr_s8 = _sum8; + outptr_s8 += 8; + } + else + { + // dequantize and relu + __lsx_vst(_sumfp32_0, outptr_f32, 0); + __lsx_vst(_sumfp32_1, outptr_f32 + 4, 0); + outptr_f32 += 8; + } + } + } + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int sum = 0; + + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + signed char val = sptr[space_ofs[k]]; + signed char w = kptr[k]; + sum += val * w; + } + + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + float sumfp32 = sum * scale_in; + + if (bias_term) + sumfp32 += bias_data[g]; + + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + if (use_int8_requantize) + { + // requantize + float scale_out = top_blob_int8_scales[g]; + signed char sums8 = float2int8(sumfp32 * scale_out); + outptr_s8[0] = sums8; + outptr_s8 += 1; + } + else + { + // dequantize + outptr_f32[0] = sumfp32; + outptr_f32 += 1; + } + } + } + } + } + } + + return 0; + } + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + if (use_int8_requantize) + out_elempack = num_output % 8 == 0 ? 8 : 1; + else + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __loongarch_sx + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // group convolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + g_elempack = channels_g % 8 == 0 ? 8 : 1; + if (use_int8_requantize) + out_g_elempack = num_output_g % 8 == 0 ? 8 : 1; + else + out_g_elempack = num_output_g % 4 == 0 ? 
4 : 1; + } +#endif // __loongarch_sx + + // unpacking + Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); + } + + Mat top_blob_unpacked = top_blob; + if (out_g_elempack < out_elempack) + { + top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_unpacked.allocator; + + // forward + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + else + { + top_blob = top_blob_unpacked; + } + + return 0; +} +#endif // NCNN_INT8 + +} // namespace ncnn diff --git a/src/layer/loongarch/convolutiondepthwise_loongarch.h b/src/layer/loongarch/convolutiondepthwise_loongarch.h new file mode 100644 index 000000000000..554fe7643049 --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_loongarch.h @@ -0,0 +1,50 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#ifndef LAYER_CONVOLUTIONDEPTHWISE_LOONGARCH_H +#define LAYER_CONVOLUTIONDEPTHWISE_LOONGARCH_H + +#include "convolutiondepthwise.h" + +namespace ncnn { + +class ConvolutionDepthWise_loongarch : virtual public ConvolutionDepthWise +{ +public: + ConvolutionDepthWise_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +protected: + int create_group_ops(const Option& opt); +#if NCNN_INT8 + int create_pipeline_int8_loongarch(const Option& opt); + int forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + +public: + Layer* activation; + std::vector group_ops; + + Mat weight_data_tm; +}; + +} // namespace ncnn + +#endif // LAYER_CONVOLUTIONDEPTHWISE_LOONGARCH_H diff --git a/src/layer/loongarch/crop_loongarch.cpp b/src/layer/loongarch/crop_loongarch.cpp new file mode 100644 index 000000000000..e7c588bc4760 --- /dev/null +++ b/src/layer/loongarch/crop_loongarch.cpp @@ -0,0 +1,399 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "crop_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +namespace ncnn { + +Crop_loongarch::Crop_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +#if __loongarch_sx +static void crop_pack4_lsx(const Mat& src, Mat& dst, int top, int left) +{ + int w = dst.w; + int h = dst.h; + int right = src.w - dst.w - left; + + const float* ptr = src.row(top) + left * 4; + float* outptr = dst; + + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } + + ptr += (left + right) * 4; + } +} +#endif // __loongarch_sx + +int Crop_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 4) + { + int _woffset, _hoffset, _doffset, _coffset; + int _outw, _outh, _outd, _outc; + resolve_crop_roi(bottom_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc); + + if (dims == 1) + { + int out_elempack = _outw % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw / out_elempack == w && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_woffset % 4 == 0 && out_elempack == 4) + { + top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + crop_pack4_lsx(bottom_blob, top_blob, 0, _woffset / elempack); + + return 0; + } + } + + if (dims == 2) + { + int out_elempack = _outh % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh / out_elempack == h && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_hoffset % 4 == 0 && out_elempack == 4) + { + top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + crop_pack4_lsx(bottom_blob, top_blob, _hoffset / elempack, _woffset); + + return 0; + } + } + + if (dims == 3) + { + int out_elempack = _outc % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_coffset % 4 == 0 && out_elempack == 4) + { + const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack); + + if (_outw == w && _outh == h) + { + top_blob = bottom_blob_sliced.clone(); + if (top_blob.empty()) + return -100; + } + + top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < top_blob.c; q++) + { + const Mat m = bottom_blob_sliced.channel(q); + Mat borderm = top_blob.channel(q); + + crop_pack4_lsx(m, borderm, _hoffset, _woffset); + } + + return 0; + } + } + + if (dims == 4) + { + int out_elempack = _outc % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_coffset % 4 == 0 && out_elempack == 4) + { + const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack); + + if (_outw == w && _outh == h && _outd == d) + { + top_blob = bottom_blob_sliced.clone(); + if (top_blob.empty()) + return -100; + } + + top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < top_blob.c; q++) + { + for (int z = 0; z < _outd; z++) + { + const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset); + Mat borderm = top_blob.channel(q).depth(z); + + crop_pack4_lsx(m, borderm, _hoffset, _woffset); + } + } + + return 0; + } + } + } +#endif // __loongarch_sx + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + } + + return Crop::forward(bottom_blob_unpacked, top_blob, opt); +} + +int Crop_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& reference_blob = bottom_blobs[1]; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int ref_elempack = reference_blob.elempack; + + Mat& top_blob = top_blobs[0]; + +#if __loongarch_sx + if (elempack == 4) + { + int _woffset, _hoffset, _doffset, _coffset; + int _outw, _outh, _outd, _outc; + if (woffset == -233) + { + resolve_crop_roi(bottom_blob.shape(), (const int*)reference_blob, _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc); + } + else + { + resolve_crop_roi(bottom_blob.shape(), reference_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc); + } + + if (dims == 1) + { + int out_elempack = _outw % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw / out_elempack == w && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_woffset % 4 == 0 && out_elempack == 4) + { + top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + crop_pack4_lsx(bottom_blob, top_blob, 0, _woffset / elempack); + + return 0; + } + } + + if (dims == 2) + { + int out_elempack = _outh % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh / out_elempack == h && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_hoffset % 4 == 0 && out_elempack == 4) + { + top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + crop_pack4_lsx(bottom_blob, top_blob, _hoffset / elempack, _woffset); + + return 0; + } + } + + if (dims == 3) + { + int out_elempack = _outc % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_coffset % 4 == 0 && out_elempack == 4) + { + const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack); + + if (_outw == w && _outh == h) + { + top_blob = bottom_blob_sliced.clone(); + if (top_blob.empty()) + return -100; + } + + top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < top_blob.c; q++) + { + const Mat m = bottom_blob_sliced.channel(q); + Mat borderm = top_blob.channel(q); + + crop_pack4_lsx(m, borderm, _hoffset, _woffset); + } + + return 0; + } + } + + if (dims == 4) + { + int out_elempack = _outc % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_coffset % 4 == 0 && out_elempack == 4) + { + const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack); + + if (_outw == w && _outh == h && _outd == d) + { + top_blob = bottom_blob_sliced.clone(); + if (top_blob.empty()) + return -100; + } + + top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < top_blob.c; q++) + { + for (int z = 0; z < _outd; z++) + { + const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset); + Mat borderm = top_blob.channel(q).depth(z); + + crop_pack4_lsx(m, borderm, _hoffset, _woffset); + } + } + + return 0; + } + } + } +#endif // __loongarch_sx + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + } + + Mat reference_blob_unpacked = reference_blob; + if (ref_elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(reference_blob, reference_blob_unpacked, 1, opt_pack1); + } + + std::vector bottom_blobs_unpacked(2); + bottom_blobs_unpacked[0] = bottom_blob_unpacked; + bottom_blobs_unpacked[1] = reference_blob_unpacked; + + return Crop::forward(bottom_blobs_unpacked, top_blobs, opt); +} + +} // namespace ncnn diff --git a/src/layer/loongarch/crop_loongarch.h b/src/layer/loongarch/crop_loongarch.h new file mode 100644 index 000000000000..0ba460256d6a --- /dev/null +++ b/src/layer/loongarch/crop_loongarch.h @@ -0,0 +1,34 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CROP_LOONGARCH_H +#define LAYER_CROP_LOONGARCH_H + +#include "crop.h" + +namespace ncnn { + +class Crop_loongarch : virtual public Crop +{ +public: + Crop_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_CROP_LOONGARCH_H diff --git a/src/layer/loongarch/deconvolution_loongarch.cpp b/src/layer/loongarch/deconvolution_loongarch.cpp new file mode 100644 index 000000000000..bb913909b551 --- /dev/null +++ b/src/layer/loongarch/deconvolution_loongarch.cpp @@ -0,0 +1,284 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "deconvolution_loongarch.h" + +#include "layer_type.h" + +#if __loongarch_sx +#include <lsxintrin.h> +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +#if __loongarch_sx +#include "deconvolution_pack4.h" +#include "deconvolution_pack1to4.h" +#include "deconvolution_pack4to1.h" +#endif // __loongarch_sx + +Deconvolution_loongarch::Deconvolution_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Deconvolution_loongarch::create_pipeline(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + int num_input = weight_data_size / maxk / num_output; + + Mat weight_data_transposed(weight_data.w); + { + float* pt = weight_data_transposed; + const float* p = weight_data; + + for (int i = 0; i < num_input * num_output; i++) + { + for (int k = 0; k < maxk; k++) + { + pt[maxk - 1 - k] = p[k]; + } + + p += maxk; + pt += maxk; + } + } + + int elempack = 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 4 == 0 ?
4 : 1; + } +#endif + + // src = kw-kh-inch-outch + // dst = pb-pa-kw-kh-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output); + + weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + float* g00 = weight_data_tm.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < elempack; i++) + { + for (int j = 0; j < out_elempack; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + } + +#if __loongarch_sx + // pack4 + if (elempack == 4 && out_elempack == 4) + { + } + + // pack1ton + if (elempack == 1 && out_elempack == 4) + { + } + + // pack4to1 + if (elempack == 4 && out_elempack == 1) + { + } +#endif // __loongarch_sx + + // pack1 + if (elempack == 1 && out_elempack == 1) + { + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int Deconvolution_loongarch::destroy_pipeline(const Option& opt) +{ + return 0; +} + +int Deconvolution_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + // deconvolv with NxN kernel + // value = value + bias + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + // NCNN_LOGE("Deconvolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; + int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat top_blob_bordered; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0)) + { + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator); + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + } + if (top_blob_bordered.empty()) + return -100; + + const int maxk = kernel_w * kernel_h; + +#if __loongarch_sx + if (elempack == 4 && out_elempack == 4) + { + { + deconvolution_pack4_lsx(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 4) + { + { + deconvolution_pack1to4_lsx(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == 4 && out_elempack == 1) + { + { + deconvolution_pack4to1_lsx(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } +#endif // __loongarch_sx + + if (elempack == 1 && out_elempack == 1) + { + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output; p++) + { + float* outptr = top_blob_bordered.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + const float* kptr = (const float*)weight_data_tm.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + const float* sptr = m.row(sy); + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + float val = sptr[sx]; + + int k = y * kernel_w + x; + + float w = kptr[k]; + + sum += val * w; + } + } + + kptr += maxk; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } + } + } + + cut_padding(top_blob_bordered, top_blob, opt); + if (top_blob.empty()) + return -100; + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/deconvolution_loongarch.h b/src/layer/loongarch/deconvolution_loongarch.h new file mode 100644 index 000000000000..bb7653b563fa --- /dev/null +++ b/src/layer/loongarch/deconvolution_loongarch.h @@ -0,0 +1,38 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DECONVOLUTION_LOONGARCH_H +#define LAYER_DECONVOLUTION_LOONGARCH_H + +#include "deconvolution.h" + +namespace ncnn { + +class Deconvolution_loongarch : virtual public Deconvolution +{ +public: + Deconvolution_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +public: + Mat weight_data_tm; +}; + +} // namespace ncnn + +#endif // LAYER_DECONVOLUTION_LOONGARCH_H diff --git a/src/layer/loongarch/deconvolution_pack1to4.h b/src/layer/loongarch/deconvolution_pack1to4.h new file mode 100644 index 000000000000..ee1f932b57a9 --- /dev/null +++ b/src/layer/loongarch/deconvolution_pack1to4.h @@ -0,0 +1,99 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
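+// deconvolution_pack1to4_lsx: the bottom blob is unpacked (elempack=1) and the top
+// blob is packed by 4 (elempack=4). For each output pixel the loop walks the kernel
+// window in reverse (sys = i + y*dilation_h - (kernel_extent_h - 1)) and keeps only
+// taps that land on a stride multiple, then multiplies one input scalar against a
+// 4-wide weight vector with __lsx_vfmadd_s so four output channels accumulate at once.
+// Weights are assumed to arrive in the [outch/4][inch][maxk][4] order written by
+// Deconvolution_loongarch::create_pipeline.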
+ +static void deconvolution_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_data_ptr) + { + _sum = (__m128)__lsx_vld((const float*)bias_data_ptr + p * 4, 0); + } + + const float* kptr = (const float*)weight_data_pack1ton + maxk * channels * p * 4; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + const float* sptr = m.row(sy); + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + float val = sptr[sx]; + + int k = y * kernel_w + x; + + __m128 _val = (__m128)__lsx_vreplfr2vr_s(val); + __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + } + } + + kptr += maxk * 4; + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/deconvolution_pack4.h b/src/layer/loongarch/deconvolution_pack4.h new file mode 100644 index 000000000000..179a410350fb --- /dev/null +++ b/src/layer/loongarch/deconvolution_pack4.h @@ -0,0 +1,106 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
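+// deconvolution_pack4_lsx: both blobs use elempack=4, so every kernel tap carries a
+// 4x4 weight tile (16 floats). The inner loop broadcasts each of the four input lanes
+// with __lsx_vreplfr2vr_s and accumulates them against the matching weight rows via
+// __lsx_vfmadd_s, i.e. a small 4x4 matrix-vector product per valid tap.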
+ +static void deconvolution_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_data_ptr) + { + _sum = (__m128)__lsx_vld((const float*)bias_data_ptr + p * 4, 0); + } + + const float* kptr = (const float*)weight_data_pack4.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + const float* sptr = m.row(sy) + sx * 4; + + int k = (y * kernel_w + x) * 16; + + __m128 _val0 = (__m128)__lsx_vreplfr2vr_s(*sptr++); + __m128 _val1 = (__m128)__lsx_vreplfr2vr_s(*sptr++); + __m128 _val2 = (__m128)__lsx_vreplfr2vr_s(*sptr++); + __m128 _val3 = (__m128)__lsx_vreplfr2vr_s(*sptr++); + __m128 _w0 = (__m128)__lsx_vld(kptr + k, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + k + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + k + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + k + 12, 0); + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + _sum = __lsx_vfmadd_s(_w1, _val1, _sum); + _sum = __lsx_vfmadd_s(_w2, _val2, _sum); + _sum = __lsx_vfmadd_s(_w3, _val3, _sum); + } + } + + kptr += maxk * 16; + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/deconvolution_pack4to1.h b/src/layer/loongarch/deconvolution_pack4to1.h new file mode 100644 index 000000000000..e13721c2c35d --- /dev/null +++ b/src/layer/loongarch/deconvolution_pack4to1.h @@ -0,0 +1,101 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
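+// deconvolution_pack4to1_lsx: the bottom blob is packed by 4 and the top blob is
+// scalar. Each valid tap performs one 4-wide multiply-accumulate; the four lanes are
+// then collapsed with __lsx_reduce_fadd_s and added to the scalar bias sum before the
+// scalar activation_ss is applied.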
+ +static void deconvolution_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4to1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_data_ptr) + { + sum = bias_data_ptr[p]; + } + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + const float* kptr = (const float*)weight_data_pack4to1 + maxk * channels * p * 4; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + const float* sptr = m.row(sy) + sx * 4; + + int k = y * kernel_w + x; + + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + } + } + + kptr += maxk * 4; + } + + sum += __lsx_reduce_fadd_s(_sum); + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } +} diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp new file mode 100644 index 000000000000..a141dd703601 --- /dev/null +++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp @@ -0,0 +1,412 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
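+// Rough structure of this file: the true depth-wise case (channels == group ==
+// num_output) is handled directly with an LSX pack4 path and a scalar pack1 path,
+// while general grouped deconvolution falls back to one ncnn::Deconvolution layer per
+// group created in create_group_ops, converting packing around the per-group calls.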
+ +#include "deconvolutiondepthwise_loongarch.h" + +#include "layer_type.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +DeconvolutionDepthWise_loongarch::DeconvolutionDepthWise_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int DeconvolutionDepthWise_loongarch::create_pipeline(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels == group && group == num_output) + { + int elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = channels % 4 == 0 ? 4 : 1; + } +#endif + + Mat weight_data_transposed(weight_data.w); + { + float* pt = weight_data_transposed; + const float* p = weight_data; + + for (int i = 0; i < (channels / group) * (num_output / group) * group; i++) + { + for (int k = 0; k < maxk; k++) + { + pt[maxk - 1 - k] = p[k]; + } + + p += maxk; + pt += maxk; + } + } + +#if __loongarch_sx + // pack4 + if (elempack == 4) + { + Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, 4, opt); + } +#endif // __loongarch_sx + + if (elempack == 1) + { + weight_data_tm = weight_data_transposed; + } + + return 0; + } + + // group convolution + create_group_ops(opt); + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int DeconvolutionDepthWise_loongarch::create_group_ops(const Option& opt) +{ + // create Deconvolution op for each group + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + for (int i = 0; i < (int)group_ops.size(); i++) + delete group_ops[i]; + + group_ops.clear(); + + const int channels_g = channels / group; + const int num_output_g = num_output / group; + + group_ops.resize(group); + + for (int g = 0; g < group; g++) + { + Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone(); + Mat bias_data_g; + if (bias_term) + bias_data_g = bias_data.range(num_output_g * g, num_output_g); + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + + // set param + ncnn::ParamDict pd; + pd.set(0, num_output_g); // num_output + pd.set(1, kernel_w); + pd.set(11, kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, 0); // pad_w + pd.set(14, 0); // pad_h + pd.set(18, output_pad_right); + pd.set(19, output_pad_bottom); + pd.set(5, bias_term); + pd.set(6, maxk * channels_g * num_output_g); // weight_data_size + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + // set weights + if (bias_term) + { + ncnn::Mat weights[2]; + weights[0] = weight_data_g; + weights[1] = bias_data_g; + + op->load_model(ModelBinFromMatArray(weights)); + } + else + { + ncnn::Mat weights[1]; + weights[0] = weight_data_g; + + op->load_model(ModelBinFromMatArray(weights)); + } + + op->create_pipeline(opt); + + group_ops[g] = op; + } + + return 0; +} + +int DeconvolutionDepthWise_loongarch::destroy_pipeline(const Option& opt) +{ + for (int i = 0; i < (int)group_ops.size(); i++) + { + group_ops[i]->destroy_pipeline(opt); + delete group_ops[i]; + } + group_ops.clear(); + + return 0; +} + +int DeconvolutionDepthWise_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) 
const +{ + // convolv with NxN kernel + // value = value + bias + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; + int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat top_blob_bordered; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0)) + { + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator); + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + } + if (top_blob_bordered.empty()) + return -100; + + const int maxk = kernel_w * kernel_h; + + // depth-wise + if (channels * elempack == group && group == num_output) + { +#if __loongarch_sx + if (elempack == 4) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + float* outptr = top_blob_bordered.channel(g); + const float* kptr = (const float*)weight_data_tm + maxk * g * 4; + const Mat m = bottom_blob.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = (__m128)__lsx_vld((const float*)bias_data + g * 4, 0); + } + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + const float* sptr = m.row(sy) + sx * 4; + + int k = y * kernel_w + x; + + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + float* outptr = top_blob_bordered.channel(g); + const float* kptr = (const float*)weight_data_tm + maxk * g; + const Mat m = bottom_blob.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[g]; + } + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + const float* sptr = m.row(sy); + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + float val = sptr[sx]; + + int k = y * kernel_w + x; + + float w = 
kptr[k]; + + sum += val * w; + } + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } + } + } + else + { + // group deconvolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + g_elempack = channels_g % 4 == 0 ? 4 : 1; + out_g_elempack = num_output_g % 4 == 0 ? 4 : 1; + } +#endif + + // unpacking + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p); + } + + Mat top_blob_bordered_unpacked = top_blob_bordered; + if (out_g_elempack < out_elempack) + { + top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator); + if (top_blob_bordered_unpacked.empty()) + return -100; + } + + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_bordered_unpacked.allocator; + + // forward + op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt); + } + else + { + top_blob_bordered = top_blob_bordered_unpacked; + } + } + + cut_padding(top_blob_bordered, top_blob, opt); + if (top_blob.empty()) + return -100; + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.h b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h new file mode 100644 index 000000000000..e41e7cac9e18 --- /dev/null +++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h @@ -0,0 +1,43 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
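+// DeconvolutionDepthWise_loongarch keeps weight_data_tm (reordered weights for the
+// depth-wise path) and group_ops (one Deconvolution layer per group for the grouped
+// fallback); both are set up in create_pipeline, the latter via create_group_ops.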
+ +#ifndef LAYER_DECONVOLUTIONDEPTHWISE_LOONGARCH_H +#define LAYER_DECONVOLUTIONDEPTHWISE_LOONGARCH_H + +#include "deconvolutiondepthwise.h" + +namespace ncnn { + +class DeconvolutionDepthWise_loongarch : virtual public DeconvolutionDepthWise +{ +public: + DeconvolutionDepthWise_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + int create_group_ops(const Option& opt); + +public: + std::vector group_ops; + + Mat weight_data_tm; +}; + +} // namespace ncnn + +#endif // LAYER_DECONVOLUTIONDEPTHWISE_LOONGARCH_H diff --git a/src/layer/loongarch/dequantize_loongarch.cpp b/src/layer/loongarch/dequantize_loongarch.cpp new file mode 100644 index 000000000000..5ee9595f89f0 --- /dev/null +++ b/src/layer/loongarch/dequantize_loongarch.cpp @@ -0,0 +1,838 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "dequantize_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Dequantize_loongarch::Dequantize_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Dequantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + // assert bottom_blob.elembits() == 32 + + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 8) + { + if (dims == 1) + { + int w = bottom_blob.w; + int outw = w * 2; + + top_blob.create(outw, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, 
ptr, 0); + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int outh = h * 2; + + top_blob.create(w, outh, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 2); + float* ptr1 = top_blob.row(i * 2 + 1); + + __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8, 0); + __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v1, ptr1, 0); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 2); + float* ptr1 = top_blob.row(i * 2 + 1); + + __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8, 0); + __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v1, ptr1, 0); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = channels * 2; + + top_blob.create(w, h, outc, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 2); + float* ptr1 = top_blob.channel(q * 2 + 1); + + __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8, 0); + __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8 + 4, 0); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v2, ptr0 + 4, 0); + __lsx_vst(_v1, ptr1, 0); + __lsx_vst(_v3, ptr1 + 4, 0); + + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v1, ptr1, 0); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 2); + float* ptr1 = top_blob.channel(q * 2 + 1); + + __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8, 0); + __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v2, ptr0 + 4, 0); + __lsx_vst(_v1, ptr1, 0); + __lsx_vst(_v3, ptr1 + 4, 0); + + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v1, ptr1, 0); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + else + { + #pragma omp parallel for 
num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 4, 0); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + __lsx_vst(_v0, ptr, 0); + __lsx_vst(_v1, ptr + 4, 0); + + intptr += 8; + ptr += 8; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 4, 0); + __m128 _bias = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale, _v1, _bias); + __lsx_vst(_v0, ptr, 0); + __lsx_vst(_v1, ptr + 4, 0); + + intptr += 8; + ptr += 8; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } + } + } + } + + return 0; + } +#endif // __loongarch_sx + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + float* ptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias_data[i]; + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i]; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias_data[i]; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + int j = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + for (; j + 3 < w; j += 4) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + *ptr++ = *intptr++ * scale; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const float bias = bias_data_size == 1 ? 
bias_data[0] : bias_data[i]; + + int j = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias); + for (; j + 3 < w; j += 4) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + *ptr++ = *intptr++ * scale + bias; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + + int i = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + __lsx_vst(_v0, ptr, 0); + __lsx_vst(_v1, ptr + 4, 0); + + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr++ = *intptr++ * scale; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + int i = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias); + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale, _v1, _bias); + __lsx_vst(_v0, ptr, 0); + __lsx_vst(_v1, ptr + 4, 0); + + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr++ = *intptr++ * scale + bias; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/dequantize_loongarch.h b/src/layer/loongarch/dequantize_loongarch.h new file mode 100644 index 000000000000..61a408d5c505 --- /dev/null +++ b/src/layer/loongarch/dequantize_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DEQUANTIZE_LOONGARCH_H +#define LAYER_DEQUANTIZE_LOONGARCH_H + +#include "dequantize.h" + +namespace ncnn { + +class Dequantize_loongarch : virtual public Dequantize +{ +public: + Dequantize_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_DEQUANTIZE_LOONGARCH_H diff --git a/src/layer/loongarch/dropout_loongarch.cpp b/src/layer/loongarch/dropout_loongarch.cpp new file mode 100644 index 000000000000..04a1f9ea95d8 --- /dev/null +++ b/src/layer/loongarch/dropout_loongarch.cpp @@ -0,0 +1,75 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "dropout_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Dropout_loongarch::Dropout_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Dropout_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + if (scale == 1.f) + { + return 0; + } + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmul_s(_p, _scale); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = *ptr * scale; + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/dropout_loongarch.h b/src/layer/loongarch/dropout_loongarch.h new file mode 100644 index 000000000000..42810050677a --- /dev/null +++ b/src/layer/loongarch/dropout_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DROPOUT_LOONGARCH_H +#define LAYER_DROPOUT_LOONGARCH_H + +#include "dropout.h" + +namespace ncnn { + +class Dropout_loongarch : virtual public Dropout +{ +public: + Dropout_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_DROPOUT_LOONGARCH_H diff --git a/src/layer/loongarch/eltwise_loongarch.cpp b/src/layer/loongarch/eltwise_loongarch.cpp new file mode 100644 index 000000000000..d803fc3db78e --- /dev/null +++ b/src/layer/loongarch/eltwise_loongarch.cpp @@ -0,0 +1,332 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "eltwise_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Eltwise_loongarch::Eltwise_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Eltwise_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int elempack = bottom_blob.elempack; + int size = w * h * elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create_like(bottom_blob, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (op_type == Operation_PROD) + { + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + _p = __lsx_vfmul_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = *ptr * *ptr1; + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(outptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr, 0); 
+ _p = __lsx_vfmul_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr *= *ptr; + + ptr++; + outptr++; + } + } + } + } + if (op_type == Operation_SUM) + { + if (coeffs.w == 0) + { + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + _p = __lsx_vfadd_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = *ptr + *ptr1; + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(outptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfadd_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr += *ptr; + + ptr++; + outptr++; + } + } + } + } + else + { + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + float coeff0 = coeffs[0]; + float coeff1 = coeffs[1]; +#if __loongarch_sx + __m128 _coeff0 = (__m128)__lsx_vreplfr2vr_s(coeff0); + __m128 _coeff1 = (__m128)__lsx_vreplfr2vr_s(coeff1); +#endif // __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + _p = __lsx_vfmul_s(_p, _coeff0); + _p = __lsx_vfmadd_s(_coeff1, _p1, _p); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = *ptr * coeff0 + *ptr1 * coeff1; + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + float coeff = coeffs[b]; +#if __loongarch_sx + __m128 _coeff = (__m128)__lsx_vreplfr2vr_s(coeff); +#endif // __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(outptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmadd_s(_coeff, _p1, _p); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr += *ptr * coeff; + + ptr++; + outptr++; + } + } + } + } + } + if (op_type == Operation_MAX) + { + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = 
bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + _p = __lsx_vfmax_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = std::max(*ptr, *ptr1); + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(outptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmax_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = std::max(*ptr, *outptr); + + ptr++; + outptr++; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/eltwise_loongarch.h b/src/layer/loongarch/eltwise_loongarch.h new file mode 100644 index 000000000000..f9715b20cadc --- /dev/null +++ b/src/layer/loongarch/eltwise_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_ELTWISE_LOONGARCH_H +#define LAYER_ELTWISE_LOONGARCH_H + +#include "eltwise.h" + +namespace ncnn { + +class Eltwise_loongarch : virtual public Eltwise +{ +public: + Eltwise_loongarch(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_ELTWISE_LOONGARCH_H diff --git a/src/layer/loongarch/flatten_loongarch.cpp b/src/layer/loongarch/flatten_loongarch.cpp new file mode 100644 index 000000000000..6d9a86362873 --- /dev/null +++ b/src/layer/loongarch/flatten_loongarch.cpp @@ -0,0 +1,370 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
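
Eltwise_loongarch combines the first two bottom blobs into top_blob and then folds every additional blob into top_blob in place, so each extra input costs one read plus one read-modify-write of the output. In the coefficient form of Operation_SUM, the vector body computes p * coeff0 with __lsx_vfmul_s and folds in p1 * coeff1 with a single __lsx_vfmadd_s. A reduced two-input sketch of that weighted sum, under the same -mlsx and helper assumptions as the sketch above:

// Sketch only: out[i] = a[i] * c0 + b[i] * c1, mirroring the coefficient SUM loop above.
static void weighted_sum2(const float* a, const float* b, float* out, int size, float c0, float c1)
{
    int i = 0;
#if __loongarch_sx
    __m128 _c0 = (__m128)__lsx_vreplfr2vr_s(c0);
    __m128 _c1 = (__m128)__lsx_vreplfr2vr_s(c1);
    for (; i + 3 < size; i += 4)
    {
        __m128 _p = (__m128)__lsx_vld(a, 0);
        __m128 _q = (__m128)__lsx_vld(b, 0);
        _p = __lsx_vfmul_s(_p, _c0);      // a * c0
        _p = __lsx_vfmadd_s(_c1, _q, _p); // + b * c1
        __lsx_vst(_p, out, 0);
        a += 4;
        b += 4;
        out += 4;
    }
#endif // __loongarch_sx
    for (; i < size; i++)
    {
        *out++ = *a++ * c0 + *b++ * c1;
    }
}
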
+ +#include "flatten_loongarch.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +namespace ncnn { + +Flatten_loongarch::Flatten_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Flatten_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + if (elembits == 8) + return forward_int8(bottom_blob, top_blob, opt); + + int dims = bottom_blob.dims; + + if (dims == 1) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + int size = w * h * d; + + int total = size * channels * elempack; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = total % 4 == 0 ? 4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (out_elempack == 1) + { + return Flatten::forward(bottom_blob, top_blob, opt); + } + + if (dims == 2 && elempack == 1) // out_elempack == 4 + { + top_blob = bottom_blob; + top_blob.dims = 1; + top_blob.w = total / out_elempack; + top_blob.h = 1; + top_blob.cstep = top_blob.w; + top_blob.elemsize = out_elemsize; + top_blob.elempack = out_elempack; + return 0; + } + + top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (dims == 2) + { +#if __loongarch_sx + if (elempack == 4) // out_elempack == 4 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(i); + float* outptr0 = (float*)top_blob + w * i * 4; + float* outptr1 = (float*)top_blob + w * (i * 4 + 1); + float* outptr2 = (float*)top_blob + w * (i * 4 + 2); + float* outptr3 = (float*)top_blob + w * (i * 4 + 3); + + int j = 0; + for (; j + 3 < w; j += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(ptr, 0); + __m128i _r1 = __lsx_vld(ptr + 4, 0); + __m128i _r2 = __lsx_vld(ptr + 4 * 2, 0); + __m128i _r3 = __lsx_vld(ptr + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr0, 0); + __lsx_vst(_r0123_1, outptr1, 0); + __lsx_vst(_r0123_2, outptr2, 0); + __lsx_vst(_r0123_3, outptr3, 0); + + ptr += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } + for (; j < w; j++) + { + *outptr0++ = ptr[0]; + *outptr1++ = ptr[1]; + *outptr2++ = ptr[2]; + *outptr3++ = ptr[3]; + + ptr += 4; + } + } + } +#endif // __loongarch_sx + } + + if (dims == 3 || dims == 4) + { +#if __loongarch_sx + if (elempack == 4) // out_elempack == 4 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + float* outptr0 = (float*)top_blob + size * q * 4; + float* outptr1 = (float*)top_blob + size * (q * 4 + 1); + float* outptr2 = (float*)top_blob + size * (q * 4 + 2); + float* outptr3 = (float*)top_blob + size * (q * 4 + 3); + + int i = 0; + for (; i + 3 < size; i += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(ptr, 0); + __m128i _r1 = 
__lsx_vld(ptr + 4, 0); + __m128i _r2 = __lsx_vld(ptr + 4 * 2, 0); + __m128i _r3 = __lsx_vld(ptr + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr0, 0); + __lsx_vst(_r0123_1, outptr1, 0); + __lsx_vst(_r0123_2, outptr2, 0); + __lsx_vst(_r0123_3, outptr3, 0); + + ptr += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } + for (; i < size; i++) + { + *outptr0++ = ptr[0]; + *outptr1++ = ptr[1]; + *outptr2++ = ptr[2]; + *outptr3++ = ptr[3]; + + ptr += 4; + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) // out_elempack == 4 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + float* outptr = (float*)top_blob + size * q; + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __lsx_vst(__lsx_vld(ptr, 0), outptr, 0); + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr++ = *ptr++; + } + } + } + } + + return 0; +} + +int Flatten_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + + if (dims == 1) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + int size = w * h * d; + + int total = size * channels * elempack; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = total % 8 == 0 ? 
8 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (out_elempack == 1) + { + return Flatten::forward(bottom_blob, top_blob, opt); + } + + if (dims == 2 && elempack == 1) // out_elempack == 8 + { + top_blob = bottom_blob; + top_blob.dims = 1; + top_blob.w = total / out_elempack; + top_blob.h = 1; + top_blob.cstep = top_blob.w; + top_blob.elemsize = out_elemsize; + top_blob.elempack = out_elempack; + return 0; + } + + top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (dims == 2) + { +#if __loongarch_sx + if (elempack == 8) // out_elempack == 8 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const signed char* ptr = bottom_blob.row(i); + signed char* outptr0 = (signed char*)top_blob + w * i * 8; + signed char* outptr1 = (signed char*)top_blob + w * (i * 8 + 1); + signed char* outptr2 = (signed char*)top_blob + w * (i * 8 + 2); + signed char* outptr3 = (signed char*)top_blob + w * (i * 8 + 3); + signed char* outptr4 = (signed char*)top_blob + w * (i * 8 + 4); + signed char* outptr5 = (signed char*)top_blob + w * (i * 8 + 5); + signed char* outptr6 = (signed char*)top_blob + w * (i * 8 + 6); + signed char* outptr7 = (signed char*)top_blob + w * (i * 8 + 7); + + int j = 0; + for (; j < w; j++) + { + *outptr0++ = ptr[0]; + *outptr1++ = ptr[1]; + *outptr2++ = ptr[2]; + *outptr3++ = ptr[3]; + *outptr4++ = ptr[4]; + *outptr5++ = ptr[5]; + *outptr6++ = ptr[6]; + *outptr7++ = ptr[7]; + + ptr += 8; + } + } + } +#endif // __loongarch_sx + } + + if (dims == 3 || dims == 4) + { +#if __loongarch_sx + if (elempack == 8) // out_elempack == 8 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const signed char* ptr = bottom_blob.channel(q); + signed char* outptr0 = (signed char*)top_blob + size * q * 8; + signed char* outptr1 = (signed char*)top_blob + size * (q * 8 + 1); + signed char* outptr2 = (signed char*)top_blob + size * (q * 8 + 2); + signed char* outptr3 = (signed char*)top_blob + size * (q * 8 + 3); + signed char* outptr4 = (signed char*)top_blob + size * (q * 8 + 4); + signed char* outptr5 = (signed char*)top_blob + size * (q * 8 + 5); + signed char* outptr6 = (signed char*)top_blob + size * (q * 8 + 6); + signed char* outptr7 = (signed char*)top_blob + size * (q * 8 + 7); + + int i = 0; + for (; i < size; i++) + { + *outptr0++ = ptr[0]; + *outptr1++ = ptr[1]; + *outptr2++ = ptr[2]; + *outptr3++ = ptr[3]; + *outptr4++ = ptr[4]; + *outptr5++ = ptr[5]; + *outptr6++ = ptr[6]; + *outptr7++ = ptr[7]; + + ptr += 8; + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) // out_elempack == 8 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const signed char* ptr = bottom_blob.channel(q); + signed char* outptr = (signed char*)top_blob + size * q; + + int i = 0; + for (; i < size; i++) + { + *outptr++ = *ptr++; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/flatten_loongarch.h b/src/layer/loongarch/flatten_loongarch.h new file mode 100644 index 000000000000..afd35c701f59 --- /dev/null +++ b/src/layer/loongarch/flatten_loongarch.h @@ -0,0 +1,35 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. 
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_FLATTEN_LOONGARCH_H +#define LAYER_FLATTEN_LOONGARCH_H + +#include "flatten.h" + +namespace ncnn { + +class Flatten_loongarch : virtual public Flatten +{ +public: + Flatten_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_FLATTEN_LOONGARCH_H diff --git a/src/layer/loongarch/hardsigmoid_loongarch.cpp b/src/layer/loongarch/hardsigmoid_loongarch.cpp new file mode 100644 index 000000000000..9dfedb689bc5 --- /dev/null +++ b/src/layer/loongarch/hardsigmoid_loongarch.cpp @@ -0,0 +1,79 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
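
Flatten_loongarch only takes the packed path when the flattened length is a multiple of the pack size (4 for fp32, 8 for int8); otherwise it falls back to the reference Flatten::forward. For an elempack-4 input, routing each lane to its own output row is a 4x4 float transpose built from the vilvl/vilvh word and doubleword interleaves. In isolation, the shuffle sequence used above looks like this (same -mlsx and include assumptions as the earlier sketches; e0..e3 are four consecutive packed elements and outk receives lane k of each):

// Sketch of the 4x4 transpose step from the flatten kernels above.
static void transpose4x4(const float* ptr, float* out0, float* out1, float* out2, float* out3)
{
    __m128i _e0 = __lsx_vld(ptr, 0);      // e0: lanes 0..3
    __m128i _e1 = __lsx_vld(ptr + 4, 0);  // e1
    __m128i _e2 = __lsx_vld(ptr + 8, 0);  // e2
    __m128i _e3 = __lsx_vld(ptr + 12, 0); // e3

    __m128i _r01l = __lsx_vilvl_w(_e1, _e0); // e0[0] e1[0] e0[1] e1[1]
    __m128i _r01h = __lsx_vilvh_w(_e1, _e0); // e0[2] e1[2] e0[3] e1[3]
    __m128i _r23l = __lsx_vilvl_w(_e3, _e2); // e2[0] e3[0] e2[1] e3[1]
    __m128i _r23h = __lsx_vilvh_w(_e3, _e2); // e2[2] e3[2] e2[3] e3[3]

    __lsx_vst(__lsx_vilvl_d(_r23l, _r01l), out0, 0); // lane 0 of e0..e3
    __lsx_vst(__lsx_vilvh_d(_r23l, _r01l), out1, 0); // lane 1
    __lsx_vst(__lsx_vilvl_d(_r23h, _r01h), out2, 0); // lane 2
    __lsx_vst(__lsx_vilvh_d(_r23h, _r01h), out3, 0); // lane 3
}
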
+ +#include "hardsigmoid_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +HardSigmoid_loongarch::HardSigmoid_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int HardSigmoid_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(alpha); + __m128 _beta = (__m128)__lsx_vreplfr2vr_s(beta); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmadd_s(_alpha, _p, _beta); + _p = __lsx_vfmax_s(_p, _zero); + _p = __lsx_vfmin_s(_p, _one); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < lower) + *ptr = 0.f; + else if (*ptr > upper) + *ptr = 1.f; + else + *ptr = *ptr * alpha + beta; + ++ptr; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/hardsigmoid_loongarch.h b/src/layer/loongarch/hardsigmoid_loongarch.h new file mode 100644 index 000000000000..755ae89ff03e --- /dev/null +++ b/src/layer/loongarch/hardsigmoid_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_HARDSIGMOID_LOONGARCH_H +#define LAYER_HARDSIGMOID_LOONGARCH_H + +#include "hardsigmoid.h" + +namespace ncnn { + +class HardSigmoid_loongarch : virtual public HardSigmoid +{ +public: + HardSigmoid_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_HARDSIGMOID_LOONGARCH_H diff --git a/src/layer/loongarch/hardswish_loongarch.cpp b/src/layer/loongarch/hardswish_loongarch.cpp new file mode 100644 index 000000000000..f1417a7986c9 --- /dev/null +++ b/src/layer/loongarch/hardswish_loongarch.cpp @@ -0,0 +1,80 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "hardswish_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +HardSwish_loongarch::HardSwish_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int HardSwish_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(alpha); + __m128 _beta = (__m128)__lsx_vreplfr2vr_s(beta); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = __lsx_vfmadd_s(_alpha, _p, _beta); + _outp = __lsx_vfmax_s(_outp, _zero); + _outp = __lsx_vfmin_s(_outp, _one); + _outp = __lsx_vfmul_s(_outp, _p); + __lsx_vst(_outp, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < lower) + *ptr = 0.f; + else if (*ptr > upper) + ; + else + *ptr = *ptr * (*ptr * alpha + beta); + ++ptr; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/hardswish_loongarch.h b/src/layer/loongarch/hardswish_loongarch.h new file mode 100644 index 000000000000..e9b0821245c3 --- /dev/null +++ b/src/layer/loongarch/hardswish_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_HARDSWISH_LOONGARCH_H +#define LAYER_HARDSWISH_LOONGARCH_H + +#include "hardswish.h" + +namespace ncnn { + +class HardSwish_loongarch : virtual public HardSwish +{ +public: + HardSwish_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_HARDSWISH_LOONGARCH_H diff --git a/src/layer/loongarch/innerproduct_loongarch.cpp b/src/layer/loongarch/innerproduct_loongarch.cpp new file mode 100644 index 000000000000..3dd6ff35e232 --- /dev/null +++ b/src/layer/loongarch/innerproduct_loongarch.cpp @@ -0,0 +1,1637 @@ +// yala is pleased to support the open source community by making ncnn available. 
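
HardSigmoid_loongarch and HardSwish_loongarch take the same approach: the vector body evaluates alpha * x + beta with __lsx_vfmadd_s and clamps it to [0, 1] via __lsx_vfmax_s/__lsx_vfmin_s (HardSwish then multiplies the clamped value back by x), while the scalar tail branches on the lower/upper thresholds kept by the base layers. Both forms compute the same piecewise function. A scalar reference for comparison, assuming, as in the stock ncnn layers, that lower = -beta / alpha and upper = (1 - beta) / alpha:

// Sketch only: reference semantics of the two branches above.
static inline float hardsigmoid_ref(float x, float alpha, float beta)
{
    float y = x * alpha + beta;
    if (y < 0.f) y = 0.f; // x < lower  <=>  alpha * x + beta < 0 (for alpha > 0)
    if (y > 1.f) y = 1.f; // x > upper  <=>  alpha * x + beta > 1 (for alpha > 0)
    return y;
}

static inline float hardswish_ref(float x, float alpha, float beta)
{
    return x * hardsigmoid_ref(x, alpha, beta);
}
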
+// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "innerproduct_loongarch.h" + +#include "layer_type.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +#include "loongarch_activation.h" + +namespace ncnn { + +InnerProduct_loongarch::InnerProduct_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx + + flatten = 0; +} + +int InnerProduct_loongarch::create_pipeline(const Option& opt) +{ + { + flatten = ncnn::create_layer(ncnn::LayerType::Flatten); + + ncnn::ParamDict pd; + + flatten->load_param(pd); + + flatten->create_pipeline(opt); + } + +#if NCNN_INT8 + if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + { + return create_pipeline_int8_loongarch(opt); + } +#endif + +#if __loongarch_sx + if (opt.use_fp16_storage) + { + return create_pipeline_fp16s(opt); + } +#endif + + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; + +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __loongarch_sx + + if (out_elempack == 4) + { + // src = inch-outch + // dst = 4-inch-outch/4 + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / 4, (size_t)4u * 4, 4); + + for (int q = 0; q + 3 < num_output; q += 4) + { + float* g0 = weight_data_tm.row(q / 4); + + for (int p = 0; p < num_input; p++) + { + for (int j = 0; j < 4; j++) + { + *g0++ = weight_data_r2.row(q + j)[p]; + } + } + } + } + } + else + { + weight_data_tm = weight_data; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int InnerProduct_loongarch::destroy_pipeline(const Option& opt) +{ + if (flatten) + { + flatten->destroy_pipeline(opt); + delete flatten; + flatten = 0; + } + + return 0; +} + +int InnerProduct_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + return forward_int8_loongarch(bottom_blob, top_blob, opt); + } +#endif + +#if __loongarch_sx + if (opt.use_fp16_storage) + { + return forward_fp16s(bottom_blob, top_blob, opt); + } +#endif + + const int num_input = weight_data_size / num_output; + + if (bottom_blob.dims == 2 && bottom_blob.w == num_input && bottom_blob.h * bottom_blob.elempack > 1) + { + // gemm + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + num_output_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { +#if __loongarch_sx + if (elempack == 4 && num_output_elempack == 4) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const float* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 0]); + _sum1 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 1]); + _sum2 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 2]); + _sum3 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 3]); + } + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 16); + __m128 _val = (__m128)__lsx_vld(m, 0); + __m128i _w = __lsx_vld(kptr, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 3), _val, _sum3); + + m += 4; + kptr += 4; + } + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + _sum1 = activation_ps(_sum1, activation_type, activation_params); + _sum2 = activation_ps(_sum2, activation_type, activation_params); + _sum3 = activation_ps(_sum3, activation_type, activation_params); + + __lsx_vst(_sum0, outptr, 0); + __lsx_vst(_sum1, outptr + 4, 0); + __lsx_vst(_sum2, outptr + 8, 0); + __lsx_vst(_sum3, outptr + 12, 0); + outptr += 16; + } + } + + if (elempack == 1 && num_output_elempack == 4) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const float* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + int i = 0; + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(m, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3); + + m += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = __lsx_vreplfr2vr_s(m[0]); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum0 = __lsx_vfmadd_s(_w, _val, _sum0); + + m += 1; + kptr += 4; + } + + _sum0 = __lsx_vfadd_s(_sum0, _sum1); + _sum2 = __lsx_vfadd_s(_sum2, _sum3); + _sum0 = __lsx_vfadd_s(_sum0, _sum2); + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + + __lsx_vst(_sum0, outptr, 0); + outptr += 4; + } + } + + if (elempack == 4 && num_output_elempack == 1) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const float* kptr = (const float*)weight_data_tm + 
num_input * p; + const float* m = bottom_blob.row(j); + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = __lsx_vreplfr2vr_s(bias_data[p]); + } + + for (int i = 0; i < num_input; i++) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 4); + __m128 _val = (__m128)__lsx_vld(m, 0); + __m128 _k = __lsx_vreplfr2vr_s(kptr[0]); + _sum = __lsx_vfmadd_s(_k, _val, _sum); + + m += 4; + kptr += 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr, 0); + outptr += 4; + } + } +#endif // __loongarch_sx + + if (elempack == 1 && num_output_elempack == 1) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const float* kptr = (const float*)weight_data_tm + num_input * p; + const float* m = bottom_blob.row(j); + + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + int i = 0; +#if __loongarch_sx + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _m, _sum); + + m += 4; + kptr += 4; + } + sum += __lsx_reduce_fadd_s(_sum); +#endif // __loongarch_sx + for (; i < num_input; i++) + { + sum += *m * *kptr; + + m += 1; + kptr += 1; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[0] = sum; + outptr += 1; + } + } + } + + return 0; + } + + // flatten + Mat bottom_blob_flattened = bottom_blob; + if (bottom_blob.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + + flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + } + + size_t elemsize = bottom_blob_flattened.elemsize; + int elempack = bottom_blob_flattened.elempack; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif // __loongarch_sx + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + const float* kptr = weight_data_tm.row(p); + + const float* sptr = bottom_blob_flattened; + + int i = 0; + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(sptr + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(sptr, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3); + + sptr += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = __lsx_vreplfr2vr_s(sptr[0]); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum0 = __lsx_vfmadd_s(_w, _val, _sum0); + + sptr += 1; + kptr += 4; + } + + _sum0 = __lsx_vfadd_s(_sum0, _sum1); + _sum2 = __lsx_vfadd_s(_sum2, _sum3); + _sum0 = __lsx_vfadd_s(_sum0, _sum2); + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + + float* outptr = top_blob; + __lsx_vst(_sum0, outptr + p * 4, 0); + } + } +#endif // __loongarch_sx + + if (out_elempack == 1) + { + int nn_num_output = num_output / 4; + int remain_num_output_start = nn_num_output * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * 4; + + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + + if (bias_term) + { + sum0 = bias_data[p]; + sum1 = bias_data[p + 1]; + sum2 = bias_data[p + 2]; + sum3 = bias_data[p + 3]; + } + + const float* w0 = (const float*)weight_data_tm + num_input * p; + const float* w1 = (const float*)weight_data_tm + num_input * (p + 1); + const float* w2 = (const float*)weight_data_tm + num_input * (p + 2); + const float* w3 = (const float*)weight_data_tm + num_input * (p + 3); + + const float* m = bottom_blob_flattened; + + int i = 0; +#if __loongarch_sx + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(w0 + 16); + __builtin_prefetch(w1 + 16); + __builtin_prefetch(w2 + 16); + __builtin_prefetch(w3 + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w0 = (__m128)__lsx_vld(w0, 0); + __m128 _w1 = (__m128)__lsx_vld(w1, 0); + __m128 _w2 = (__m128)__lsx_vld(w2, 0); + __m128 _w3 = (__m128)__lsx_vld(w3, 0); + _sum0 = __lsx_vfmadd_s(_w0, _m, _sum0); + _sum1 = __lsx_vfmadd_s(_w1, _m, _sum1); + _sum2 = __lsx_vfmadd_s(_w2, _m, _sum2); + _sum3 = __lsx_vfmadd_s(_w3, _m, _sum3); + + m += 4; + w0 += 4; + w1 += 4; + w2 += 4; + 
w3 += 4; + } +#endif // __loongarch_sx + for (; i < num_input; i++) + { + sum0 += *m * *w0; + sum1 += *m * *w1; + sum2 += *m * *w2; + sum3 += *m * *w3; + + m++; + w0++; + w1++; + w2++; + w3++; + } + +#if __loongarch_sx + sum0 += __lsx_reduce_fadd_s(_sum0); + sum1 += __lsx_reduce_fadd_s(_sum1); + sum2 += __lsx_reduce_fadd_s(_sum2); + sum3 += __lsx_reduce_fadd_s(_sum3); +#endif // __loongarch_sx + + sum0 = activation_ss(sum0, activation_type, activation_params); + sum1 = activation_ss(sum1, activation_type, activation_params); + sum2 = activation_ss(sum2, activation_type, activation_params); + sum3 = activation_ss(sum3, activation_type, activation_params); + + top_blob[p] = sum0; + top_blob[p + 1] = sum1; + top_blob[p + 2] = sum2; + top_blob[p + 3] = sum3; + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_num_output_start; p < num_output; p++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[p]; + + const float* w = (const float*)weight_data_tm + num_input * p; + + const float* m = bottom_blob_flattened; + + int i = 0; +#if __loongarch_sx + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(w + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w = (__m128)__lsx_vld(w, 0); + _sum0 = __lsx_vfmadd_s(_w, _m, _sum0); + + m += 4; + w += 4; + } + sum += __lsx_reduce_fadd_s(_sum0); +#endif // __loongarch_sx + for (; i < num_input; i++) + { + sum += *m * *w; + + m++; + w++; + } + + sum = activation_ss(sum, activation_type, activation_params); + + top_blob[p] = sum; + } + } + + return 0; +} + +#if __loongarch_sx +int InnerProduct_loongarch::create_pipeline_fp16s(const Option& opt) +{ + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } + + // src = inch-outch + // dst = pb-inch-outch/pb + if (out_elempack == 4) + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / 4, (size_t)8u, 4); + + for (int q = 0; q + 3 < num_output; q += 4) + { + unsigned short* g0 = weight_data_tm.row(q / 4); + + const float* k0 = weight_data_r2.row(q); + const float* k1 = weight_data_r2.row(q + 1); + const float* k2 = weight_data_r2.row(q + 2); + const float* k3 = weight_data_r2.row(q + 3); + + int p = 0; + for (; p + 3 < num_input; p += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(k0, 0); + __m128i _r1 = __lsx_vld(k1, 0); + __m128i _r2 = __lsx_vld(k2, 0); + __m128i _r3 = __lsx_vld(k3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __m128i _p0 = __lsx_vfcvt_h_s((__m128)_r0123_1, (__m128)_r0123_0); + __m128i _p1 = __lsx_vfcvt_h_s((__m128)_r0123_3, (__m128)_r0123_2); + + __lsx_vst(_p0, g0, 0); + __lsx_vst(_p1, g0 + 8, 0); + + k0 += 4; + k1 += 4; + k2 += 4; + k3 += 4; + g0 += 16; + } + for (; p < num_input; p++) + { + g0[0] = float32_to_float16(*k0++); + g0[1] = float32_to_float16(*k1++); + g0[2] = float32_to_float16(*k2++); + g0[3] = float32_to_float16(*k3++); + g0 += 4; + } + } + } + + if (out_elempack == 1) + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt); + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int InnerProduct_loongarch::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int num_input = weight_data_size / num_output; + + if (bottom_blob.dims == 2 && bottom_blob.w == num_input && bottom_blob.h * bottom_blob.elempack > 1) + { + // gemm + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; + if (opt.use_packing_layout) + { + num_output_elempack = num_output % 4 == 0 ? 
4 : 1; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { + if (elempack == 4 && num_output_elempack == 4) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const unsigned short* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 0]); + _sum1 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 1]); + _sum2 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 2]); + _sum3 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 3]); + } + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 16); + __m128 _val = (__m128)__lsx_vld(m, 0); + __m128i _w = (__m128i)__lsx_vfcvtl_s_h(__lsx_vld(kptr, 0)); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 3), _val, _sum3); + + m += 4; + kptr += 4; + } + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + _sum1 = activation_ps(_sum1, activation_type, activation_params); + _sum2 = activation_ps(_sum2, activation_type, activation_params); + _sum3 = activation_ps(_sum3, activation_type, activation_params); + + __lsx_vst(_sum0, outptr, 0); + __lsx_vst(_sum1, outptr + 4, 0); + __lsx_vst(_sum2, outptr + 8, 0); + __lsx_vst(_sum3, outptr + 12, 0); + outptr += 16; + } + } + + if (elempack == 1 && num_output_elempack == 4) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const unsigned short* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + int i = 0; + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(m, 0); + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 8, 0); + __m128 _w0 = __lsx_vfcvtl_s_h(_w01); + __m128 _w1 = __lsx_vfcvth_s_h(_w01); + __m128 _w2 = __lsx_vfcvtl_s_h(_w23); + __m128 _w3 = __lsx_vfcvth_s_h(_w23); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3); + + m += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = __lsx_vreplfr2vr_s(m[0]); + __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(kptr, 0)); + _sum0 = __lsx_vfmadd_s(_w, _val, _sum0); + + m += 1; + kptr += 4; + } + + _sum0 = __lsx_vfadd_s(_sum0, _sum1); + _sum2 = __lsx_vfadd_s(_sum2, _sum3); + _sum0 = __lsx_vfadd_s(_sum0, _sum2); + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + + __lsx_vst(_sum0, outptr, 0); + outptr += 4; + } + } + + if (elempack == 4 && num_output_elempack == 1) + { + float* outptr = 
top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const unsigned short* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = __lsx_vreplfr2vr_s(bias_data[p]); + } + + for (int i = 0; i < num_input; i++) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 4); + __m128 _val = (__m128)__lsx_vld(m, 0); + __m128 _k = __lsx_vreplfr2vr_s(float16_to_float32(kptr[0])); + _sum = __lsx_vfmadd_s(_k, _val, _sum); + + m += 4; + kptr += 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr, 0); + outptr += 4; + } + } + + if (elempack == 1 && num_output_elempack == 1) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const unsigned short* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + int i = 0; + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(kptr, 0)); + _sum = __lsx_vfmadd_s(_w, _m, _sum); + + m += 4; + kptr += 4; + } + sum += __lsx_reduce_fadd_s(_sum); + for (; i < num_input; i++) + { + sum += *m * float16_to_float32(*kptr); + + m += 1; + kptr += 1; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[0] = sum; + outptr += 1; + } + } + } + + return 0; + } + + // flatten + Mat bottom_blob_flattened = bottom_blob; + if (bottom_blob.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + + flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + } + + size_t elemsize = bottom_blob_flattened.elemsize; + int elempack = bottom_blob_flattened.elempack; + + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + const unsigned short* kptr = weight_data_tm.row(p); + + const float* sptr = bottom_blob_flattened; + + int i = 0; + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(sptr + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(sptr, 0); + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 8, 0); + __m128 _w0 = __lsx_vfcvtl_s_h(_w01); + __m128 _w1 = __lsx_vfcvth_s_h(_w01); + __m128 _w2 = __lsx_vfcvtl_s_h(_w23); + __m128 _w3 = __lsx_vfcvth_s_h(_w23); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3); + + sptr += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = __lsx_vreplfr2vr_s(sptr[0]); + __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(kptr, 0)); + _sum0 = __lsx_vfmadd_s(_w, _val, _sum0); + + sptr += 1; + kptr += 4; + } + + _sum0 = __lsx_vfadd_s(_sum0, _sum1); + _sum2 = __lsx_vfadd_s(_sum2, _sum3); + _sum0 = __lsx_vfadd_s(_sum0, _sum2); + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + + float* outptr = top_blob; + __lsx_vst(_sum0, outptr + p * 4, 0); + } + } + + if (out_elempack == 1) + { + int nn_num_output = num_output / 4; + int remain_num_output_start = nn_num_output * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * 4; + + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + + if (bias_term) + { + sum0 = bias_data[p]; + sum1 = bias_data[p + 1]; + sum2 = bias_data[p + 2]; + sum3 = bias_data[p + 3]; + } + + const unsigned short* w0 = weight_data_tm.row(p); + const unsigned short* w1 = weight_data_tm.row(p + 1); + const unsigned short* w2 = weight_data_tm.row(p + 2); + const unsigned short* w3 = weight_data_tm.row(p + 3); + + const float* m = bottom_blob_flattened; + + int i = 0; + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(w0 + 16); + __builtin_prefetch(w1 + 16); + __builtin_prefetch(w2 + 16); + __builtin_prefetch(w3 + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w0 = __lsx_vfcvtl_s_h(__lsx_vld(w0, 0)); + __m128 _w1 = __lsx_vfcvtl_s_h(__lsx_vld(w1, 0)); + __m128 _w2 = __lsx_vfcvtl_s_h(__lsx_vld(w2, 0)); + __m128 _w3 = __lsx_vfcvtl_s_h(__lsx_vld(w3, 0)); + _sum0 = __lsx_vfmadd_s(_w0, _m, _sum0); + _sum1 = __lsx_vfmadd_s(_w1, _m, _sum1); + _sum2 = __lsx_vfmadd_s(_w2, _m, _sum2); + _sum3 = __lsx_vfmadd_s(_w3, _m, _sum3); + + m += 4; + w0 += 4; + w1 += 4; + w2 += 4; + w3 += 4; + } + for (; i < num_input; i++) + { 
+ sum0 += *m * float16_to_float32(*w0); + sum1 += *m * float16_to_float32(*w1); + sum2 += *m * float16_to_float32(*w2); + sum3 += *m * float16_to_float32(*w3); + + m++; + w0++; + w1++; + w2++; + w3++; + } + + sum0 += __lsx_reduce_fadd_s(_sum0); + sum1 += __lsx_reduce_fadd_s(_sum1); + sum2 += __lsx_reduce_fadd_s(_sum2); + sum3 += __lsx_reduce_fadd_s(_sum3); + + sum0 = activation_ss(sum0, activation_type, activation_params); + sum1 = activation_ss(sum1, activation_type, activation_params); + sum2 = activation_ss(sum2, activation_type, activation_params); + sum3 = activation_ss(sum3, activation_type, activation_params); + + top_blob[p] = sum0; + top_blob[p + 1] = sum1; + top_blob[p + 2] = sum2; + top_blob[p + 3] = sum3; + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_num_output_start; p < num_output; p++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[p]; + + const unsigned short* w = weight_data_tm.row(p); + + const float* m = bottom_blob_flattened; + + int i = 0; + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(w + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(w, 0)); + _sum0 = __lsx_vfmadd_s(_w, _m, _sum0); + + m += 4; + w += 4; + } + sum += __lsx_reduce_fadd_s(_sum0); + for (; i < num_input; i++) + { + sum += *m * float16_to_float32(*w); + + m++; + w++; + } + + sum = activation_ss(sum, activation_type, activation_params); + + top_blob[p] = sum; + } + } + + return 0; +} +#endif // __loongarch_sx + +#if NCNN_INT8 +int InnerProduct_loongarch::create_pipeline_int8_loongarch(const Option& opt) +{ + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 
8 : 1; + } +#endif // __loongarch_sx + + // src = inch-outch + // dst = pb-inch-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / out_elempack, (size_t)out_elempack, out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + signed char* g0 = weight_data_tm.row(q / out_elempack); + + for (int p = 0; p < num_input; p++) + { + for (int j = 0; j < out_elempack; j++) + { + *g0++ = weight_data_r2.row(q + j)[p]; + } + } + } + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int InnerProduct_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int num_input = weight_data_size / num_output; + + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + if (bottom_blob_int8.dims == 2 && bottom_blob_int8.w == num_input && bottom_blob_int8.h * bottom_blob_int8.elempack > 1) + { + // gemm + Mat bottom_blob_int8_unpacked; + Option opt_unpack = opt; + opt_unpack.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_int8, bottom_blob_int8_unpacked, 1, opt_unpack); + + int h = bottom_blob_int8_unpacked.h; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = h % 4 == 0 ? 4 : 1; + } +#endif + + int outh = h / out_elempack; + + top_blob.create(num_output, outh, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + num_output_elempack = num_output % 8 == 0 ? 
8 : 1; + } +#endif + +#if __loongarch_sx + if (num_output_elempack == 8 && out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m0 = bottom_blob_int8_unpacked.row(j * 4); + const signed char* m1 = bottom_blob_int8_unpacked.row(j * 4 + 1); + const signed char* m2 = bottom_blob_int8_unpacked.row(j * 4 + 2); + const signed char* m3 = bottom_blob_int8_unpacked.row(j * 4 + 3); + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum20 = __lsx_vreplgr2vr_w(0); + __m128i _sum21 = __lsx_vreplgr2vr_w(0); + __m128i _sum30 = __lsx_vreplgr2vr_w(0); + __m128i _sum31 = __lsx_vreplgr2vr_w(0); + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(m0 + 4); + __builtin_prefetch(m1 + 4); + __builtin_prefetch(m2 + 4); + __builtin_prefetch(m3 + 4); + __builtin_prefetch(kptr + 32); + __m128i _val0 = __lsx_vreplgr2vr_h((short)m0[0]); + __m128i _val1 = __lsx_vreplgr2vr_h((short)m1[0]); + __m128i _val2 = __lsx_vreplgr2vr_h((short)m2[0]); + __m128i _val3 = __lsx_vreplgr2vr_h((short)m3[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val0, _w16); + __m128i _s1 = __lsx_vmul_h(_val1, _w16); + __m128i _s2 = __lsx_vmul_h(_val2, _w16); + __m128i _s3 = __lsx_vmul_h(_val3, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _exts1 = __lsx_vslti_h(_s1, 0); + __m128i _exts2 = __lsx_vslti_h(_s2, 0); + __m128i _exts3 = __lsx_vslti_h(_s3, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + __m128i _s1l = __lsx_vilvl_h(_exts1, _s1); + __m128i _s1h = __lsx_vilvh_h(_exts1, _s1); + __m128i _s2l = __lsx_vilvl_h(_exts2, _s2); + __m128i _s2h = __lsx_vilvh_h(_exts2, _s2); + __m128i _s3l = __lsx_vilvl_h(_exts3, _s3); + __m128i _s3h = __lsx_vilvh_h(_exts3, _s3); + + _sum00 = __lsx_vadd_w(_sum00, _s0l); + _sum01 = __lsx_vadd_w(_sum01, _s0h); + _sum10 = __lsx_vadd_w(_sum10, _s1l); + _sum11 = __lsx_vadd_w(_sum11, _s1h); + _sum20 = __lsx_vadd_w(_sum20, _s2l); + _sum21 = __lsx_vadd_w(_sum21, _s2h); + _sum30 = __lsx_vadd_w(_sum30, _s3l); + _sum31 = __lsx_vadd_w(_sum31, _s3h); + + m0++; + m1++; + m2++; + m3++; + kptr += 8; + } + + // dequantize and relu + __m128 _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8, 0); + __m128 _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8 + 4, 0); + + __m128 _sumfp32_00 = __lsx_vffint_s_w(_sum00); + __m128 _sumfp32_01 = __lsx_vffint_s_w(_sum01); + __m128 _sumfp32_10 = __lsx_vffint_s_w(_sum10); + __m128 _sumfp32_11 = __lsx_vffint_s_w(_sum11); + __m128 _sumfp32_20 = __lsx_vffint_s_w(_sum20); + __m128 _sumfp32_21 = __lsx_vffint_s_w(_sum21); + __m128 _sumfp32_30 = __lsx_vffint_s_w(_sum30); + __m128 _sumfp32_31 = __lsx_vffint_s_w(_sum31); + if (bias_term) + { + __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + p * 8, 0); + __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + p * 8 + 4, 0); + _sumfp32_00 = __lsx_vfmadd_s(_scale_in0, _sumfp32_00, _bias0); + _sumfp32_01 = __lsx_vfmadd_s(_scale_in1, _sumfp32_01, _bias1); + _sumfp32_10 = __lsx_vfmadd_s(_scale_in0, _sumfp32_10, _bias0); + _sumfp32_11 = __lsx_vfmadd_s(_scale_in1, _sumfp32_11, _bias1); + _sumfp32_20 = 
__lsx_vfmadd_s(_scale_in0, _sumfp32_20, _bias0); + _sumfp32_21 = __lsx_vfmadd_s(_scale_in1, _sumfp32_21, _bias1); + _sumfp32_30 = __lsx_vfmadd_s(_scale_in0, _sumfp32_30, _bias0); + _sumfp32_31 = __lsx_vfmadd_s(_scale_in1, _sumfp32_31, _bias1); + } + else + { + _sumfp32_00 = __lsx_vfmul_s(_sumfp32_00, _scale_in0); + _sumfp32_01 = __lsx_vfmul_s(_sumfp32_01, _scale_in1); + _sumfp32_10 = __lsx_vfmul_s(_sumfp32_10, _scale_in0); + _sumfp32_11 = __lsx_vfmul_s(_sumfp32_11, _scale_in1); + _sumfp32_20 = __lsx_vfmul_s(_sumfp32_20, _scale_in0); + _sumfp32_21 = __lsx_vfmul_s(_sumfp32_21, _scale_in1); + _sumfp32_30 = __lsx_vfmul_s(_sumfp32_30, _scale_in0); + _sumfp32_31 = __lsx_vfmul_s(_sumfp32_31, _scale_in1); + } + + _sumfp32_00 = activation_ps(_sumfp32_00, activation_type, activation_params); + _sumfp32_01 = activation_ps(_sumfp32_01, activation_type, activation_params); + _sumfp32_10 = activation_ps(_sumfp32_10, activation_type, activation_params); + _sumfp32_11 = activation_ps(_sumfp32_11, activation_type, activation_params); + _sumfp32_20 = activation_ps(_sumfp32_20, activation_type, activation_params); + _sumfp32_21 = activation_ps(_sumfp32_21, activation_type, activation_params); + _sumfp32_30 = activation_ps(_sumfp32_30, activation_type, activation_params); + _sumfp32_31 = activation_ps(_sumfp32_31, activation_type, activation_params); + + // transpose 4x8 + __m128i _r01r = __lsx_vilvl_w((__m128i)_sumfp32_10, (__m128i)_sumfp32_00); + __m128i _r01l = __lsx_vilvh_w((__m128i)_sumfp32_10, (__m128i)_sumfp32_00); + __m128i _r23r = __lsx_vilvl_w((__m128i)_sumfp32_30, (__m128i)_sumfp32_20); + __m128i _r23l = __lsx_vilvh_w((__m128i)_sumfp32_30, (__m128i)_sumfp32_20); + __m128i _r45r = __lsx_vilvl_w((__m128i)_sumfp32_11, (__m128i)_sumfp32_01); + __m128i _r45l = __lsx_vilvh_w((__m128i)_sumfp32_11, (__m128i)_sumfp32_01); + __m128i _r67r = __lsx_vilvl_w((__m128i)_sumfp32_31, (__m128i)_sumfp32_21); + __m128i _r67l = __lsx_vilvh_w((__m128i)_sumfp32_31, (__m128i)_sumfp32_21); + _sumfp32_00 = (__m128)__lsx_vilvl_d(_r23r, _r01r); + _sumfp32_10 = (__m128)__lsx_vilvh_d(_r23r, _r01r); + _sumfp32_20 = (__m128)__lsx_vilvl_d(_r23l, _r01l); + _sumfp32_30 = (__m128)__lsx_vilvh_d(_r23l, _r01l); + _sumfp32_01 = (__m128)__lsx_vilvl_d(_r67r, _r45r); + _sumfp32_11 = (__m128)__lsx_vilvh_d(_r67r, _r45r); + _sumfp32_21 = (__m128)__lsx_vilvl_d(_r67l, _r45l); + _sumfp32_31 = (__m128)__lsx_vilvh_d(_r67l, _r45l); + + __lsx_vst(_sumfp32_00, outptr, 0); + __lsx_vst(_sumfp32_10, outptr + 4, 0); + __lsx_vst(_sumfp32_20, outptr + 8, 0); + __lsx_vst(_sumfp32_30, outptr + 12, 0); + __lsx_vst(_sumfp32_01, outptr + 16, 0); + __lsx_vst(_sumfp32_11, outptr + 20, 0); + __lsx_vst(_sumfp32_21, outptr + 24, 0); + __lsx_vst(_sumfp32_31, outptr + 28, 0); + + outptr += 32; + } + } + } + + if (num_output_elempack == 1 && out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m0 = bottom_blob_int8_unpacked.row(j * 4); + const signed char* m1 = bottom_blob_int8_unpacked.row(j * 4 + 1); + const signed char* m2 = bottom_blob_int8_unpacked.row(j * 4 + 2); + const signed char* m3 = bottom_blob_int8_unpacked.row(j * 4 + 3); + + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + + int i = 0; + for (; i < num_input; i++) + { + sum0 += *m0++ * kptr[0]; + sum1 += *m1++ * kptr[0]; + sum2 += *m2++ * kptr[0]; + sum3 += *m3++ * kptr[0]; + kptr 
+= 1; + } + + // dequantize and relu + float sumfp32_0 = sum0 * scale_in_data[p]; + float sumfp32_1 = sum1 * scale_in_data[p]; + float sumfp32_2 = sum2 * scale_in_data[p]; + float sumfp32_3 = sum3 * scale_in_data[p]; + + if (bias_term) + { + sumfp32_0 += bias_data[p]; + sumfp32_1 += bias_data[p]; + sumfp32_2 += bias_data[p]; + sumfp32_3 += bias_data[p]; + } + + outptr[0] = activation_ss(sumfp32_0, activation_type, activation_params); + outptr[1] = activation_ss(sumfp32_1, activation_type, activation_params); + outptr[2] = activation_ss(sumfp32_2, activation_type, activation_params); + outptr[3] = activation_ss(sumfp32_3, activation_type, activation_params); + outptr += 4; + } + } + } + + if (num_output_elempack == 8 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(m + 4); + __builtin_prefetch(kptr + 32); + __m128i _val = __lsx_vreplgr2vr_h((short)m[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + + m++; + kptr += 8; + } + + // dequantize and relu + __m128 _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8, 0); + __m128 _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8 + 4, 0); + + __m128 _sumfp32_0 = __lsx_vffint_s_w(_sum0); + __m128 _sumfp32_1 = __lsx_vffint_s_w(_sum1); + + if (bias_term) + { + __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + p * 8, 0); + __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + p * 8 + 4, 0); + _sumfp32_0 = __lsx_vfmadd_s(_scale_in0, _sumfp32_0, _bias0); + _sumfp32_1 = __lsx_vfmadd_s(_scale_in1, _sumfp32_1, _bias1); + } + else + { + _sumfp32_0 = __lsx_vfmul_s(_sumfp32_0, _scale_in0); + _sumfp32_1 = __lsx_vfmul_s(_sumfp32_1, _scale_in1); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params); + + __lsx_vst(_sumfp32_0, outptr, 0); + __lsx_vst(_sumfp32_1, outptr + 4, 0); + outptr += 8; + } + } + } +#endif // __loongarch_sx + + if (num_output_elempack == 1 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + int sum = 0; + + int i = 0; + for (; i < num_input; i++) + { + sum += *m++ * *kptr++; + } + + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + + outptr[0] = activation_ss(sumfp32, activation_type, activation_params); + outptr += 1; + } + } + } + + return 0; + } + + Mat bottom_blob_int8_flattened = bottom_blob_int8; + if (bottom_blob_int8.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + 
flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten); + } + + // int elempack = bottom_blob_int8_flattened.elempack; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif // __loongarch_sx + // size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(sptr + 4); + __builtin_prefetch(kptr + 32); + __m128i _val = __lsx_vreplgr2vr_h((short)sptr[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + + sptr += 1; + kptr += 8; + } + + // dequantize and relu + __m128 _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8, 0); + __m128 _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8 + 4, 0); + + __m128 _sumfp32_0 = __lsx_vffint_s_w(_sum0); + __m128 _sumfp32_1 = __lsx_vffint_s_w(_sum1); + + if (bias_term) + { + __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + p * 8, 0); + __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + p * 8 + 4, 0); + _sumfp32_0 = __lsx_vfmadd_s(_scale_in0, _sumfp32_0, _bias0); + _sumfp32_1 = __lsx_vfmadd_s(_scale_in1, _sumfp32_1, _bias1); + } + else + { + _sumfp32_0 = __lsx_vfmul_s(_sumfp32_0, _scale_in0); + _sumfp32_1 = __lsx_vfmul_s(_sumfp32_1, _scale_in1); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params); + + float* outptr = (float*)top_blob + p * 8; + __lsx_vst(_sumfp32_0, outptr, 0); + __lsx_vst(_sumfp32_1, outptr + 4, 0); + } + } +#endif // __loongarch_sx + + if (out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + int sum = 0; + + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + int i = 0; + for (; i < num_input; i++) + { + signed char val = sptr[0]; + + signed char w = kptr[0]; + + sum += val * w; + + sptr += 1; + kptr += 1; + } + + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + top_blob[p] = sumfp32; + } + } + + return 0; +} +#endif // NCNN_INT8 + +} // namespace ncnn diff --git a/src/layer/loongarch/innerproduct_loongarch.h b/src/layer/loongarch/innerproduct_loongarch.h new file mode 100644 index 000000000000..4d9574ce9192 --- /dev/null +++ b/src/layer/loongarch/innerproduct_loongarch.h @@ -0,0 +1,54 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. 
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_INNERPRODUCT_LOONGARCH_H +#define LAYER_INNERPRODUCT_LOONGARCH_H + +#include "innerproduct.h" + +namespace ncnn { + +class InnerProduct_loongarch : virtual public InnerProduct +{ +public: + InnerProduct_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: +#if __loongarch_sx + int create_pipeline_fp16s(const Option& opt); + int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif +#if NCNN_INT8 + int create_pipeline_int8_loongarch(const Option& opt); + int forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + +public: + Layer* flatten; + + Mat weight_data_tm; + +#if NCNN_INT8 + Mat scale_in_data; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_INNERPRODUCT_LOONGARCH_H diff --git a/src/layer/loongarch/interp_bicubic.h b/src/layer/loongarch/interp_bicubic.h new file mode 100644 index 000000000000..e52ba81de4f0 --- /dev/null +++ b/src/layer/loongarch/interp_bicubic.h @@ -0,0 +1,261 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
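Note on the bicubic path that follows: interpolate_cubic() produces four Keys-style cubic weights (A = -0.75) for the taps at sx-1 .. sx+2, and the fourth weight is defined so that the four always sum to 1, which gives the filter linear precision. Below is a minimal standalone sketch of how those weights blend four source samples; the helper name cubic_weights and the sample values are illustrative only, not part of the patch.

#include <cstdio>

static void cubic_weights(float fx, float* w) // same A = -0.75 convention as interpolate_cubic()
{
    const float A = -0.75f;
    float fx0 = fx + 1, fx1 = fx, fx2 = 1 - fx;
    w[0] = A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A;
    w[1] = (A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1;
    w[2] = (A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1;
    w[3] = 1.f - w[0] - w[1] - w[2]; // the four weights always sum to 1
}

int main()
{
    // interpolate at fractional offset 0.25 past the second sample of a linear ramp
    float s[4] = {1.f, 2.f, 3.f, 4.f}; // taps at sx-1, sx, sx+1, sx+2
    float w[4];
    cubic_weights(0.25f, w);
    float v = s[0] * w[0] + s[1] * w[1] + s[2] * w[2] + s[3] * w[3];
    printf("%f\n", v); // ~2.25: a linear ramp is reproduced exactly
    return 0;
}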
+ +static inline void interpolate_cubic(float fx, float* coeffs) +{ + const float A = -0.75f; + + float fx0 = fx + 1; + float fx1 = fx; + float fx2 = 1 - fx; + // float fx3 = 2 - fx; + + coeffs[0] = A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A; + coeffs[1] = (A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1; + coeffs[2] = (A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} + +static void cubic_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner) +{ + double scale = (double)w / outw; + if (align_corner) + { + scale = (double)(w - 1) / (outw - 1); + } + + for (int dx = 0; dx < outw; dx++) + { + float fx = (float)((dx + 0.5) * scale - 0.5); + if (align_corner) + { + fx = (float)(dx * scale); + } + + int sx = static_cast(floor(fx)); + fx -= sx; + + interpolate_cubic(fx, alpha + dx * 4); + + if (sx <= -1) + { + sx = 1; + alpha[dx * 4 + 0] = 1.f - alpha[dx * 4 + 3]; + alpha[dx * 4 + 1] = alpha[dx * 4 + 3]; + alpha[dx * 4 + 2] = 0.f; + alpha[dx * 4 + 3] = 0.f; + } + if (sx == 0) + { + sx = 1; + alpha[dx * 4 + 0] = alpha[dx * 4 + 0] + alpha[dx * 4 + 1]; + alpha[dx * 4 + 1] = alpha[dx * 4 + 2]; + alpha[dx * 4 + 2] = alpha[dx * 4 + 3]; + alpha[dx * 4 + 3] = 0.f; + } + if (sx == w - 2) + { + sx = w - 3; + alpha[dx * 4 + 3] = alpha[dx * 4 + 2] + alpha[dx * 4 + 3]; + alpha[dx * 4 + 2] = alpha[dx * 4 + 1]; + alpha[dx * 4 + 1] = alpha[dx * 4 + 0]; + alpha[dx * 4 + 0] = 0.f; + } + if (sx >= w - 1) + { + sx = w - 3; + alpha[dx * 4 + 3] = 1.f - alpha[dx * 4 + 0]; + alpha[dx * 4 + 2] = alpha[dx * 4 + 0]; + alpha[dx * 4 + 1] = 0.f; + alpha[dx * 4 + 0] = 0.f; + } + + xofs[dx] = sx; + } +} + +static void resize_bicubic_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) +{ + int w = dst.w; + int h = dst.h; + + // loop body + Mat rowsbuf0(w); + Mat rowsbuf1(w); + Mat rowsbuf2(w); + Mat rowsbuf3(w); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + float* rows2 = rowsbuf2; + float* rows3 = rowsbuf3; + + int prev_sy1 = -3; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // reuse all rows + } + else if (sy == prev_sy1 + 1) + { + // hresize one row + float* rows0_old = rows0; + rows0 = rows1; + rows1 = rows2; + rows2 = rows3; + rows3 = rows0_old; + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S3p = S3 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; + + alphap += 4; + } + } + else if (sy == prev_sy1 + 2) + { + // hresize two rows + float* rows0_old = rows0; + float* rows1_old = rows1; + rows0 = rows2; + rows1 = rows3; + rows2 = rows0_old; + rows3 = rows1_old; + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3; + rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; + + alphap += 4; + } + } + else if (sy == prev_sy1 + 3) + { + // hresize three rows + float* rows0_old = rows0; + float* rows1_old = rows1; + 
float* rows2_old = rows2; + rows0 = rows3; + rows1 = rows0_old; + rows2 = rows1_old; + rows3 = rows2_old; + const float* S1 = src.row(sy); + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S1p = S1 + sx; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3; + rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3; + rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; + + alphap += 4; + } + } + else + { + // hresize four rows + const float* S0 = src.row(sy - 1); + const float* S1 = src.row(sy); + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S0p = S0 + sx; + const float* S1p = S1 + sx; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + rows0p[dx] = S0p[-1] * a0 + S0p[0] * a1 + S0p[1] * a2 + S0p[2] * a3; + rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3; + rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3; + rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; + + alphap += 4; + } + } + + prev_sy1 = sy; + + // vresize + float b0 = beta[0]; + float b1 = beta[1]; + float b2 = beta[2]; + float b3 = beta[3]; + + float* rows0p = rows0; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + float* Dp = dst.row(dy); + for (int dx = 0; dx < w; dx++) + { + // D[x] = rows0[x]*b0 + rows1[x]*b1 + rows2[x]*b2 + rows3[x]*b3; + *Dp++ = *rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3; + } + + beta += 4; + } +} diff --git a/src/layer/loongarch/interp_bicubic_pack4.h b/src/layer/loongarch/interp_bicubic_pack4.h new file mode 100644 index 000000000000..54281691ad79 --- /dev/null +++ b/src/layer/loongarch/interp_bicubic_pack4.h @@ -0,0 +1,286 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +static void resize_bicubic_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) +{ + int w = dst.w; + int h = dst.h; + + // loop body + Mat rowsbuf0(w, (size_t)4 * 4u, 4); + Mat rowsbuf1(w, (size_t)4 * 4u, 4); + Mat rowsbuf2(w, (size_t)4 * 4u, 4); + Mat rowsbuf3(w, (size_t)4 * 4u, 4); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + float* rows2 = rowsbuf2; + float* rows3 = rowsbuf3; + + int prev_sy1 = -3; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // reuse all rows + } + else if (sy == prev_sy1 + 1) + { + // hresize one row + float* rows0_old = rows0; + rows0 = rows1; + rows1 = rows2; + rows2 = rows3; + rows3 = rows0_old; + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S3p = S3 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0); + __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0); + __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0); + __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0); + __m128 _rows3 = __lsx_vfmul_s(_S30, _a0); + _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3); + _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3); + _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3); + __lsx_vst(_rows3, rows3p + dx * 4, 0); + + alphap += 4; + } + } + else if (sy == prev_sy1 + 2) + { + // hresize two rows + float* rows0_old = rows0; + float* rows1_old = rows1; + rows0 = rows2; + rows1 = rows3; + rows2 = rows0_old; + rows3 = rows1_old; + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S20 = (__m128)__lsx_vld(S2p - 4, 0); + __m128 _S21 = (__m128)__lsx_vld(S2p + 0, 0); + __m128 _S22 = (__m128)__lsx_vld(S2p + 4, 0); + __m128 _S23 = (__m128)__lsx_vld(S2p + 8, 0); + __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0); + __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0); + __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0); + __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0); + __m128 _rows2 = __lsx_vfmul_s(_S20, _a0); + __m128 _rows3 = __lsx_vfmul_s(_S30, _a0); + _rows2 = __lsx_vfmadd_s(_a1, _S21, _rows2); + _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3); + _rows2 = __lsx_vfmadd_s(_a2, _S22, _rows2); + _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3); + _rows2 = __lsx_vfmadd_s(_a3, _S23, _rows2); + _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3); + __lsx_vst(_rows2, rows2p + dx * 4, 0); + __lsx_vst(_rows3, rows3p + dx * 4, 0); + + alphap += 4; + } + } + else if (sy == prev_sy1 + 3) + { + // hresize three rows + float* rows0_old = rows0; + float* rows1_old = rows1; + float* rows2_old = rows2; + rows0 = rows3; + rows1 = rows0_old; + rows2 = rows1_old; + rows3 = rows2_old; + const float* S1 = src.row(sy); + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S1p 
= S1 + sx; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S10 = (__m128)__lsx_vld(S1p - 4, 0); + __m128 _S11 = (__m128)__lsx_vld(S1p + 0, 0); + __m128 _S12 = (__m128)__lsx_vld(S1p + 4, 0); + __m128 _S13 = (__m128)__lsx_vld(S1p + 8, 0); + __m128 _S20 = (__m128)__lsx_vld(S2p - 4, 0); + __m128 _S21 = (__m128)__lsx_vld(S2p + 0, 0); + __m128 _S22 = (__m128)__lsx_vld(S2p + 4, 0); + __m128 _S23 = (__m128)__lsx_vld(S2p + 8, 0); + __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0); + __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0); + __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0); + __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0); + __m128 _rows1 = __lsx_vfmul_s(_S10, _a0); + __m128 _rows2 = __lsx_vfmul_s(_S20, _a0); + __m128 _rows3 = __lsx_vfmul_s(_S30, _a0); + _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1); + _rows2 = __lsx_vfmadd_s(_a1, _S21, _rows2); + _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3); + _rows1 = __lsx_vfmadd_s(_a2, _S12, _rows1); + _rows2 = __lsx_vfmadd_s(_a2, _S22, _rows2); + _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3); + _rows1 = __lsx_vfmadd_s(_a3, _S13, _rows1); + _rows2 = __lsx_vfmadd_s(_a3, _S23, _rows2); + _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3); + __lsx_vst(_rows1, rows1p + dx * 4, 0); + __lsx_vst(_rows2, rows2p + dx * 4, 0); + __lsx_vst(_rows3, rows3p + dx * 4, 0); + + alphap += 4; + } + } + else + { + // hresize four rows + const float* S0 = src.row(sy - 1); + const float* S1 = src.row(sy); + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S0p = S0 + sx; + const float* S1p = S1 + sx; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S00 = (__m128)__lsx_vld(S0p - 4, 0); + __m128 _S01 = (__m128)__lsx_vld(S0p + 0, 0); + __m128 _S02 = (__m128)__lsx_vld(S0p + 4, 0); + __m128 _S03 = (__m128)__lsx_vld(S0p + 8, 0); + __m128 _S10 = (__m128)__lsx_vld(S1p - 4, 0); + __m128 _S11 = (__m128)__lsx_vld(S1p + 0, 0); + __m128 _S12 = (__m128)__lsx_vld(S1p + 4, 0); + __m128 _S13 = (__m128)__lsx_vld(S1p + 8, 0); + __m128 _S20 = (__m128)__lsx_vld(S2p - 4, 0); + __m128 _S21 = (__m128)__lsx_vld(S2p + 0, 0); + __m128 _S22 = (__m128)__lsx_vld(S2p + 4, 0); + __m128 _S23 = (__m128)__lsx_vld(S2p + 8, 0); + __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0); + __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0); + __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0); + __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0); + __m128 _rows0 = __lsx_vfmul_s(_S00, _a0); + __m128 _rows1 = __lsx_vfmul_s(_S10, _a0); + __m128 _rows2 = __lsx_vfmul_s(_S20, _a0); + __m128 _rows3 = __lsx_vfmul_s(_S30, _a0); + _rows0 = __lsx_vfmadd_s(_a1, _S01, _rows0); + _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1); + _rows2 = __lsx_vfmadd_s(_a1, _S21, _rows2); + _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3); + _rows0 = __lsx_vfmadd_s(_a2, _S02, _rows0); + _rows1 = __lsx_vfmadd_s(_a2, _S12, _rows1); + _rows2 = __lsx_vfmadd_s(_a2, _S22, _rows2); + _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3); + _rows0 = __lsx_vfmadd_s(_a3, _S03, _rows0); + _rows1 = 
__lsx_vfmadd_s(_a3, _S13, _rows1); + _rows2 = __lsx_vfmadd_s(_a3, _S23, _rows2); + _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3); + __lsx_vst(_rows0, rows0p + dx * 4, 0); + __lsx_vst(_rows1, rows1p + dx * 4, 0); + __lsx_vst(_rows2, rows2p + dx * 4, 0); + __lsx_vst(_rows3, rows3p + dx * 4, 0); + + alphap += 4; + } + } + + prev_sy1 = sy; + + // vresize + __m128 _b0 = __lsx_vreplfr2vr_s(beta[0]); + __m128 _b1 = __lsx_vreplfr2vr_s(beta[1]); + __m128 _b2 = __lsx_vreplfr2vr_s(beta[2]); + __m128 _b3 = __lsx_vreplfr2vr_s(beta[3]); + + float* rows0p = rows0; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + float* Dp = dst.row(dy); + + for (int dx = 0; dx < w; dx++) + { + __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0); + __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0); + __m128 _rows2 = (__m128)__lsx_vld(rows2p, 0); + __m128 _rows3 = (__m128)__lsx_vld(rows3p, 0); + __m128 _D = __lsx_vfmul_s(_rows0, _b0); + _D = __lsx_vfmadd_s(_b1, _rows1, _D); + _D = __lsx_vfmadd_s(_b2, _rows2, _D); + _D = __lsx_vfmadd_s(_b3, _rows3, _D); + __lsx_vst(_D, Dp, 0); + + Dp += 4; + rows0p += 4; + rows1p += 4; + rows2p += 4; + rows3p += 4; + } + + beta += 4; + } +} diff --git a/src/layer/loongarch/interp_bilinear.h b/src/layer/loongarch/interp_bilinear.h new file mode 100644 index 000000000000..ad5a28672bef --- /dev/null +++ b/src/layer/loongarch/interp_bilinear.h @@ -0,0 +1,172 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +static void linear_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner) +{ + double scale = (double)w / outw; + if (align_corner) + { + scale = (double)(w - 1) / (outw - 1); + } + + for (int dx = 0; dx < outw; dx++) + { + float fx = (float)((dx + 0.5) * scale - 0.5); + if (align_corner) + { + fx = (float)(dx * scale); + } + + int sx = floor(fx); + fx -= sx; + + if (sx < 0) + { + sx = 0; + fx = 0.f; + } + if (sx >= w - 1) + { + sx = w - 2; + fx = 1.f; + } + + xofs[dx] = sx; + + alpha[dx * 2] = 1.f - fx; + alpha[dx * 2 + 1] = fx; + } +} + +static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) +{ + int w = dst.w; + int h = dst.h; + + // loop body + Mat rowsbuf0(w); + Mat rowsbuf1(w); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + + int prev_sy1 = -2; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // reuse all rows + } + else if (sy == prev_sy1 + 1) + { + // hresize one row + float* rows0_old = rows0; + rows0 = rows1; + rows1 = rows0_old; + const float* S1 = src.row(sy + 1); + + const float* alphap = alpha; + float* rows1p = rows1; + int dx = 0; + for (; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S1p = S1 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; + + alphap += 2; + } + } + else + { + // hresize two rows + const float* S0 = src.row(sy); + const float* S1 = src.row(sy + 1); + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + int dx = 0; + for (; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S0p = S0 + sx; + const float* S1p = S1 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + rows0p[dx] = S0p[0] * a0 + S0p[1] * a1; + rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; + + alphap += 2; + } + } + + prev_sy1 = sy; + + // vresize + float b0 = beta[0]; + float b1 = beta[1]; + + float* rows0p = rows0; + float* rows1p = rows1; + float* Dp = dst.row(dy); + +#if __loongarch_sx + int nn = w >> 3; +#else + int nn = 0; +#endif + int remain = w - (nn << 3); + +#if __loongarch_sx + __m128 _b0 = __lsx_vreplfr2vr_s(b0); + __m128 _b1 = __lsx_vreplfr2vr_s(b1); + for (; nn > 0; nn--) + { + __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0); + __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0); + + __m128 _D = __lsx_vfmul_s(_rows0, _b0); + _D = __lsx_vfmadd_s(_b1, _rows1, _D); + + __lsx_vst(_D, Dp, 0); + + __m128 _rows0n = (__m128)__lsx_vld(rows0p + 4, 0); + __m128 _rows1n = (__m128)__lsx_vld(rows1p + 4, 0); + + __m128 _Dn = __lsx_vfmul_s(_rows0n, _b0); + _Dn = __lsx_vfmadd_s(_b1, _rows1n, _Dn); + + __lsx_vst(_Dn, Dp + 4, 0); + + Dp += 8; + rows0p += 8; + rows1p += 8; + } +#endif // __loongarch_sx + for (; remain; --remain) + { + // D[x] = rows0[x]*b0 + rows1[x]*b1; + *Dp++ = *rows0p++ * b0 + *rows1p++ * b1; + } + + beta += 2; + } +} diff --git a/src/layer/loongarch/interp_bilinear_pack4.h b/src/layer/loongarch/interp_bilinear_pack4.h new file mode 100644 index 000000000000..2cfb138a1cbd --- /dev/null +++ b/src/layer/loongarch/interp_bilinear_pack4.h @@ -0,0 +1,123 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void resize_bilinear_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) +{ + int w = dst.w; + int h = dst.h; + + // loop body + Mat rowsbuf0(w, (size_t)4 * 4u, 4); + Mat rowsbuf1(w, (size_t)4 * 4u, 4); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + + int prev_sy1 = -2; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // reuse all rows + } + else if (sy == prev_sy1 + 1) + { + // hresize one row + float* rows0_old = rows0; + rows0 = rows1; + rows1 = rows0_old; + const float* S1 = src.row(sy + 1); + + const float* alphap = alpha; + float* rows1p = rows1; + int dx = 0; + for (; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S1p = S1 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + + __m128 _S10 = (__m128)__lsx_vld(S1p, 0); + __m128 _S11 = (__m128)__lsx_vld(S1p + 4, 0); + __m128 _rows1 = __lsx_vfmul_s(_S10, _a0); + _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1); + __lsx_vst(_rows1, rows1p + dx * 4, 0); + + alphap += 2; + } + } + else + { + // hresize two rows + const float* S0 = src.row(sy); + const float* S1 = src.row(sy + 1); + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + int dx = 0; + for (; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S0p = S0 + sx; + const float* S1p = S1 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + + __m128 _S00 = (__m128)__lsx_vld(S0p, 0); + __m128 _S01 = (__m128)__lsx_vld(S0p + 4, 0); + __m128 _S10 = (__m128)__lsx_vld(S1p, 0); + __m128 _S11 = (__m128)__lsx_vld(S1p + 4, 0); + __m128 _rows0 = __lsx_vfmul_s(_S00, _a0); + __m128 _rows1 = __lsx_vfmul_s(_S10, _a0); + _rows0 = __lsx_vfmadd_s(_a1, _S01, _rows0); + _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1); + __lsx_vst(_rows0, rows0p + dx * 4, 0); + __lsx_vst(_rows1, rows1p + dx * 4, 0); + + alphap += 2; + } + } + + prev_sy1 = sy; + + // vresize + __m128 _b0 = __lsx_vreplfr2vr_s(beta[0]); + __m128 _b1 = __lsx_vreplfr2vr_s(beta[1]); + + float* rows0p = rows0; + float* rows1p = rows1; + float* Dp = dst.row(dy); + + for (int dx = 0; dx < w; dx++) + { + __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0); + __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0); + __m128 _D = __lsx_vfmul_s(_rows0, _b0); + _D = __lsx_vfmadd_s(_b1, _rows1, _D); + __lsx_vst(_D, Dp, 0); + + Dp += 4; + rows0p += 4; + rows1p += 4; + } + + beta += 2; + } +} diff --git a/src/layer/loongarch/interp_loongarch.cpp b/src/layer/loongarch/interp_loongarch.cpp new file mode 100644 index 000000000000..94d25cf005eb --- /dev/null +++ b/src/layer/loongarch/interp_loongarch.cpp @@ -0,0 +1,470 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "interp_loongarch.h"
+
+#include <math.h>
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif // __loongarch_sx
+
+#include "loongarch_usability.h"
+
+namespace ncnn {
+
+#include "interp_bicubic.h"
+#include "interp_bilinear.h"
+
+#if __loongarch_sx
+#include "interp_bicubic_pack4.h"
+#include "interp_bilinear_pack4.h"
+#endif
+
+Interp_loongarch::Interp_loongarch()
+{
+#if __loongarch_sx
+    support_packing = true;
+#endif // __loongarch_sx
+}
+
+int Interp_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& reference_blob = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    int h = bottom_blob.h;
+    int w = bottom_blob.w;
+    int channels = bottom_blob.c;
+    int dims = bottom_blob.dims;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    int outw = reference_blob.w;
+    int outh = reference_blob.h;
+
+    if (dims == 1)
+    {
+        top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+#if __loongarch_sx
+        if (elempack == 4)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < w; q++)
+            {
+                Mat top_blob_c = top_blob.channel(q);
+                __m128 _v = (__m128)__lsx_vld((const float*)bottom_blob + q * 4, 0);
+                top_blob_c.fill(_v);
+            }
+
+            return 0;
+        }
+#endif // __loongarch_sx
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < w; q++)
+        {
+            Mat top_blob_c = top_blob.channel(q);
+            const float v = bottom_blob[q];
+            top_blob_c.fill(v);
+        }
+
+        return 0;
+    }
+
+    if (dims == 2)
+    {
+        if (outw == w)
+        {
+            top_blob = bottom_blob;
+            return 0;
+        }
+
+        top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+#if __loongarch_sx
+        if (elempack == 4)
+        {
+            if (resize_type == 1) // nearest
+            {
+                const float ws = output_width ?
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + + __m128 _p = (__m128)__lsx_vld(ptr + in_x * 4, 0); + __lsx_vst(_p, outptr, 0); + + outptr += 4; + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outw * 2]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + linear_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x] * 4; + const float* Sp = ptr + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + + __m128 _S0 = (__m128)__lsx_vld(Sp, 0); + __m128 _S1 = (__m128)__lsx_vld(Sp + 4, 0); + __m128 _p = __lsx_vfmul_s(_S0, _a0); + _p = __lsx_vfmadd_s(_a1, _S1, _p); + __lsx_vst(_p, outptr, 0); + + alphap += 2; + outptr += 4; + } + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outw * 4]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x] * 4; + const float* Sp = ptr + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S0 = (__m128)__lsx_vld(Sp - 4, 0); + __m128 _S1 = (__m128)__lsx_vld(Sp + 0, 0); + __m128 _S2 = (__m128)__lsx_vld(Sp + 4, 0); + __m128 _S3 = (__m128)__lsx_vld(Sp + 8, 0); + __m128 _p = __lsx_vfmul_s(_S0, _a0); + _p = __lsx_vfmadd_s(_a1, _S1, _p); + _p = __lsx_vfmadd_s(_a2, _S2, _p); + _p = __lsx_vfmadd_s(_a3, _S3, _p); + __lsx_vst(_p, outptr, 0); + + alphap += 4; + outptr += 4; + } + } + + delete[] buf; + } + + return 0; + } +#endif // __loongarch_sx + + if (resize_type == 1) // nearest + { + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + *outptr++ = ptr[in_x]; + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outw * 2]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + linear_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x]; + const float* Sp = ptr + sx; + float a0 = alphap[0]; + float a1 = alphap[1]; + *outptr++ = Sp[0] * a0 + Sp[1] * a1; + alphap += 2; + } + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outw * 4]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x]; + const float* Sp = ptr + sx; + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + *outptr++ = Sp[-1] * a0 + Sp[0] * a1 + Sp[1] * a2 + Sp[2] * a3; + alphap += 4; + } + } + + delete[] buf; + } + + return 0; + } + + if (outw == w && outh == h) + { + top_blob = bottom_blob; + return 0; + } + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 4) + { + if (resize_type == 1) // nearest + { + const float hs = output_height ? h / (float)outh : 1.f / height_scale; + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + for (int y = 0; y < outh; y++) + { + int in_y = std::min((int)(y * hs), (h - 1)); + + const float* ptr = src.row(in_y); + float* outptr = dst.row(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + + __m128 _p = (__m128)__lsx_vld(ptr + in_x * 4, 0); + __lsx_vst(_p, outptr, 0); + + outptr += 4; + } + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outh + outw * 2 + outh * 2]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; + float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; + + linear_coeffs(w, outw, xofs, alpha, align_corner); + linear_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bilinear_image_pack4(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outh + outw * 4 + outh * 4]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; + float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + cubic_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bicubic_image_pack4(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + return 0; + } +#endif // __loongarch_sx + + if (resize_type == 1) // nearest + { + const float hs = output_height ? h / (float)outh : 1.f / height_scale; + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            for (int y = 0; y < outh; y++)
+            {
+                int in_y = std::min((int)(y * hs), (h - 1));
+
+                const float* ptr = src.row(in_y);
+                float* outptr = dst.row(y);
+                for (int x = 0; x < outw; x++)
+                {
+                    int in_x = std::min((int)(x * ws), (w - 1));
+                    *outptr++ = ptr[in_x];
+                }
+            }
+        }
+    }
+
+    if (resize_type == 2) // bilinear
+    {
+        int* buf = new int[outw + outh + outw * 2 + outh * 2];
+
+        int* xofs = buf;        //new int[outw];
+        int* yofs = buf + outw; //new int[outh];
+
+        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
+        float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
+
+        linear_coeffs(w, outw, xofs, alpha, align_corner);
+        linear_coeffs(h, outh, yofs, beta, align_corner);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bilinear_image(src, dst, alpha, xofs, beta, yofs);
+        }
+
+        delete[] buf;
+    }
+
+    if (resize_type == 3) // bicubic
+    {
+        int* buf = new int[outw + outh + outw * 4 + outh * 4];
+
+        int* xofs = buf;        //new int[outw];
+        int* yofs = buf + outw; //new int[outh];
+
+        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
+        float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
+
+        cubic_coeffs(w, outw, xofs, alpha, align_corner);
+        cubic_coeffs(h, outh, yofs, beta, align_corner);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bicubic_image(src, dst, alpha, xofs, beta, yofs);
+        }
+
+        delete[] buf;
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/interp_loongarch.h b/src/layer/loongarch/interp_loongarch.h
new file mode 100644
index 000000000000..4c0e0f3dc86b
--- /dev/null
+++ b/src/layer/loongarch/interp_loongarch.h
@@ -0,0 +1,32 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_INTERP_LOONGARCH_H
+#define LAYER_INTERP_LOONGARCH_H
+
+#include "interp.h"
+
+namespace ncnn {
+
+class Interp_loongarch : virtual public Interp
+{
+public:
+    Interp_loongarch();
+
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_INTERP_LOONGARCH_H
diff --git a/src/layer/loongarch/loongarch_activation.h b/src/layer/loongarch/loongarch_activation.h
new file mode 100644
index 000000000000..abb268f4bb6d
--- /dev/null
+++ b/src/layer/loongarch/loongarch_activation.h
@@ -0,0 +1,70 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LOONGARCH_ACTIVATION_H
+#define LOONGARCH_ACTIVATION_H
+
+#include "fused_activation.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+
+static inline __m128 activation_ps(__m128 _v, int activation_type, const ncnn::Mat& activation_params)
+{
+    if (activation_type == 1)
+    {
+        __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
+        _v = __lsx_vfmax_s(_v, _zero);
+    }
+    else if (activation_type == 2)
+    {
+        __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
+        __m128 _slope = (__m128)__lsx_vreplfr2vr_s(activation_params[0]);
+        __m128i _lemask = __lsx_vfcmp_cle_s(_v, _zero);
+        __m128 _ps = __lsx_vfmul_s(_v, _slope);
+        _v = (__m128)__lsx_vbitsel_v((__m128i)_v, (__m128i)_ps, (__m128i)_lemask);
+    }
+    else if (activation_type == 3)
+    {
+        __m128 _min = (__m128)__lsx_vreplfr2vr_s(activation_params[0]);
+        __m128 _max = (__m128)__lsx_vreplfr2vr_s(activation_params[1]);
+        _v = __lsx_vfmax_s(_v, _min);
+        _v = __lsx_vfmin_s(_v, _max);
+    }
+    else if (activation_type == 4)
+    {
+        _v = sigmoid_ps(_v);
+    }
+    else if (activation_type == 5)
+    {
+        _v = __lsx_vfmul_s(_v, tanh_ps(log_ps(__lsx_vfadd_s(exp_ps(_v), (__m128)__lsx_vreplfr2vr_s(1.f)))));
+    }
+    else if (activation_type == 6)
+    {
+        __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(activation_params[0]);
+        __m128 _beta = (__m128)__lsx_vreplfr2vr_s(activation_params[1]);
+        __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
+        __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
+        __m128 _outp = __lsx_vfmadd_s(_alpha, _v, _beta);
+        _outp = __lsx_vfmax_s(_outp, _zero);
+        _outp = __lsx_vfmin_s(_outp, _one);
+        _v = __lsx_vfmul_s(_outp, _v);
+    }
+
+    return _v;
+}
+#endif // __loongarch_sx
+
+#endif // LOONGARCH_ACTIVATION_H
diff --git a/src/layer/loongarch/loongarch_usability.h b/src/layer/loongarch/loongarch_usability.h
new file mode 100644
index 000000000000..d3ae5dec279d
--- /dev/null
+++ b/src/layer/loongarch/loongarch_usability.h
@@ -0,0 +1,236 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
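The usability header that follows provides the float broadcast helper (__lsx_vreplfr2vr_s, built on an int/float union), horizontal reductions, and the float2int8* quantizers. The vector quantizers emulate round-to-nearest (half away from zero) by adding a sign-matched 0.5 before truncation and then clamping to [-127, 127]. A scalar rendering of that rule, with the helper name quant_ref chosen here purely for illustration:

#include <cstdio>

static signed char quant_ref(float v)
{
    float biased = v >= 0.f ? v + 0.5f : v - 0.5f; // sign-matched 0.5, like vor(_p5, sign)
    int i = (int)biased;                           // truncate toward zero, like vftintrz
    if (i > 127) i = 127;                          // clamp, like vsat/vmax in the vector code
    if (i < -127) i = -127;
    return (signed char)i;
}

int main()
{
    const float tests[] = {1.4f, 1.5f, -1.5f, -200.f, 200.f};
    for (float t : tests)
        printf("%.1f -> %d\n", t, quant_ref(t)); // 1, 2, -2, -127, 127
    return 0;
}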
+
+#ifndef LOONGARCH_USABILITY_H
+#define LOONGARCH_USABILITY_H
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif // __loongarch_sx
+
+#include <stdint.h>
+#include <math.h>
+
+namespace ncnn {
+
+typedef union
+{
+    int32_t i;
+    float f;
+} FloatInt;
+
+} // namespace ncnn
+
+#if __loongarch_sx
+/* declare some loongarch constants with union */
+#define _LOONGARCH_FLOAT_CONST(Name, Val) \
+    static const ncnn::FloatInt Name = {.f = Val}
+
+/* float type data load instructions */
+static NCNN_FORCEINLINE __m128 __lsx_vreplfr2vr_s(float val)
+{
+    ncnn::FloatInt fi_tmpval = {.f = val};
+    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+}
+
+static NCNN_FORCEINLINE float __lsx_reduce_fadd_s(__m128 _v)
+{
+    // TODO find a more efficient way
+    float* _v_p = (float*)&_v;
+    return _v_p[0] + _v_p[1] + _v_p[2] + _v_p[3];
+}
+
+static NCNN_FORCEINLINE int __lsx_reduce_add_w(__m128i _v)
+{
+    // TODO find a more efficient way
+    int* _v_p = (int*)&_v;
+    return _v_p[0] + _v_p[1] + _v_p[2] + _v_p[3];
+}
+
+#endif // __loongarch_sx
+
+static NCNN_FORCEINLINE signed char float2int8(float v)
+{
+    int int32 = round(v);
+    if (int32 > 127) return 127;
+    if (int32 < -127) return -127;
+    return (signed char)int32;
+}
+
+#if __loongarch_sx
+static NCNN_FORCEINLINE __m128i float2int8(__m128 _v)
+{
+    // simulate round to nearest via +/-0.5
+    __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f);
+    __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31);
+
+    __m128i _sign = __lsx_vand_v((__m128i)_v, _signmask);
+    __m128 _p5s = (__m128)__lsx_vor_v((__m128i)_p5, (__m128i)_sign);
+    __m128 _v5 = __lsx_vfadd_s(_v, _p5s);
+    __m128i _v32 = __lsx_vftintrz_w_s(_v5);
+
+    __m128i _v32_16 = __lsx_vsat_w(_v32, 15);
+    __m128i _v16 = __lsx_vpickev_h(_v32_16, _v32_16);
+    _v16 = __lsx_vmax_h(_v16, __lsx_vreplgr2vr_h(-127));
+    __m128i _v16_8 = __lsx_vsat_h(_v16, 7);
+    __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8);
+
+    return _v8;
+}
+
+static NCNN_FORCEINLINE int64_t float2int8(__m128 _vlow, __m128 _vhigh)
+{
+    // simulate round to nearest via +/-0.5
+    __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f);
+    __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31);
+
+    __m128i _signlow = __lsx_vand_v((__m128i)_vlow, _signmask);
+    __m128i _signhigh = __lsx_vand_v((__m128i)_vhigh, _signmask);
+    __m128 _p5low = (__m128)__lsx_vor_v((__m128i)_p5, _signlow);
+    __m128 _p5high = (__m128)__lsx_vor_v((__m128i)_p5, _signhigh);
+    __m128 _vlow5 = __lsx_vfadd_s(_vlow, _p5low);
+    __m128 _vhigh5 = __lsx_vfadd_s(_vhigh, _p5high);
+    __m128i _vlow32 = __lsx_vftintrz_w_s(_vlow5);
+    __m128i _vhigh32 = __lsx_vftintrz_w_s(_vhigh5);
+
+    __m128i _vlow32_16 = __lsx_vsat_w(_vlow32, 15);
+    __m128i _vhigh32_16 = __lsx_vsat_w(_vhigh32, 15);
+    __m128i _v16 = __lsx_vpickev_h(_vhigh32_16, _vlow32_16);
+    _v16 = __lsx_vmax_h(_v16, __lsx_vreplgr2vr_h(-127));
+    __m128i _v16_8 = __lsx_vsat_h(_v16, 7);
+    __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8);
+
+    return _v8[0];
+}
+
+static NCNN_FORCEINLINE __m128i float2int8relu(__m128 _v)
+{
+    // simulate round to nearest via +/-0.5
+    __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f);
+    __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31);
+
+    __m128i _sign = __lsx_vand_v((__m128i)_v, _signmask);
+    __m128 _p5s = (__m128)__lsx_vor_v((__m128i)_p5, _sign);
+    __m128 _v5 = __lsx_vfadd_s(_v, _p5s);
+    __m128i _v32 = __lsx_vftintrz_w_s(_v5);
+
+    __m128i _v32_16 = __lsx_vsat_w(_v32, 15);
+    __m128i _v16 = __lsx_vpickev_h(_v32_16, _v32_16);
+    _v16 = __lsx_vmaxi_h(_v16, 0);
+    __m128i _v16_8 = __lsx_vsat_h(_v16, 7);
+    __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8);
+
+    return _v8;
+}
+
+static NCNN_FORCEINLINE
int64_t float2int8relu(__m128 _vlow, __m128 _vhigh) +{ + // simulate round to nearest via +/-0.5 + __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _signlow = __lsx_vand_v((__m128i)_vlow, _signmask); + __m128i _signhigh = __lsx_vand_v((__m128i)_vhigh, _signmask); + __m128 _p5low = (__m128)__lsx_vor_v((__m128i)_p5, _signlow); + __m128 _p5high = (__m128)__lsx_vor_v((__m128i)_p5, _signhigh); + __m128 _vlow5 = __lsx_vfadd_s(_vlow, _p5low); + __m128 _vhigh5 = __lsx_vfadd_s(_vhigh, _p5high); + __m128i _vlow32 = __lsx_vftintrz_w_s(_vlow5); + __m128i _vhigh32 = __lsx_vftintrz_w_s(_vhigh5); + + __m128i _vlow32_16 = __lsx_vsat_w(_vlow32, 15); + __m128i _vhigh32_16 = __lsx_vsat_w(_vhigh32, 15); + __m128i _v16 = __lsx_vpickev_h(_vhigh32_16, _vlow32_16); + _v16 = __lsx_vmaxi_h(_v16, 0); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8[0]; +} + +static NCNN_FORCEINLINE __m128i float2int8leakyrelu(__m128 _v, __m128 _slope) +{ + __m128 _v_leaky = __lsx_vfmul_s(_v, _slope); + + // simulate round to nearest via +/-0.5 + __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _sign = __lsx_vand_v((__m128i)_v, _signmask); + __m128 _p5s = (__m128)__lsx_vor_v((__m128i)_p5, _sign); + __m128 _v5 = __lsx_vfadd_s(_v, _p5s); + __m128i _v32 = __lsx_vftintrz_w_s(_v5); + + __m128i _sign_leaky = __lsx_vand_v((__m128i)_v_leaky, _signmask); + __m128 _p5_leaky = (__m128)__lsx_vor_v((__m128i)_p5, _sign_leaky); + __m128 _v5_leaky = __lsx_vfadd_s(_v_leaky, _p5_leaky); + __m128i _v32_leaky = __lsx_vftintrz_w_s(_v5_leaky); + + __m128i _v32_16 = __lsx_vsat_w(_v32, 15); + __m128i _v16 = __lsx_vpickev_h(_v32_16, _v32_16); + + __m128i _v32_16_leaky = __lsx_vsat_w(_v32_leaky, 15); + __m128i _v16_leaky = __lsx_vpickev_h(_v32_16_leaky, _v32_16_leaky); + + _v16 = __lsx_vmax_h(_v16, _v16_leaky); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8; +} + +static NCNN_FORCEINLINE int64_t float2int8leakyrelu(__m128 _vlow, __m128 _vhigh, __m128 _slope) +{ + __m128 _vlow_leaky = __lsx_vfmul_s(_vlow, _slope); + __m128 _vhigh_leaky = __lsx_vfmul_s(_vhigh, _slope); + + // simulate round to nearest via +/-0.5 + __m128i _p5 = (__m128i)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _signlow = __lsx_vand_v((__m128i)_vlow, _signmask); + __m128i _signhigh = __lsx_vand_v((__m128i)_vhigh, _signmask); + __m128 _p5low = (__m128)__lsx_vor_v(_p5, _signlow); + __m128 _p5high = (__m128)__lsx_vor_v(_p5, _signhigh); + __m128 _vlow5 = __lsx_vfadd_s(_vlow, _p5low); + __m128 _vhigh5 = __lsx_vfadd_s(_vhigh, _p5high); + __m128i _vlow32 = __lsx_vftintrz_w_s(_vlow5); + __m128i _vhigh32 = __lsx_vftintrz_w_s(_vhigh5); + + __m128i _signlow_leaky = __lsx_vand_v((__m128i)_vlow_leaky, _signmask); + __m128i _signhigh_leaky = __lsx_vand_v((__m128i)_vhigh_leaky, _signmask); + __m128 _p5low_leaky = (__m128)__lsx_vor_v(_p5, _signlow_leaky); + __m128 _p5high_leaky = (__m128)__lsx_vor_v(_p5, _signhigh_leaky); + __m128 _vlow5_leaky = __lsx_vfadd_s(_vlow_leaky, _p5low_leaky); + __m128 _vhigh5_leaky = __lsx_vfadd_s(_vhigh_leaky, _p5high_leaky); + __m128i _vlow32_leaky = __lsx_vftintrz_w_s(_vlow5_leaky); + __m128i _vhigh32_leaky = __lsx_vftintrz_w_s(_vhigh5_leaky); + + __m128i _vlow32_16 = __lsx_vsat_w(_vlow32, 15); + __m128i _vhigh32_16 = __lsx_vsat_w(_vhigh32, 15); + __m128i _v16 = __lsx_vpickev_h(_vhigh32_16, 
_vlow32_16); + + __m128i _vlow32_16_leaky = __lsx_vsat_w(_vlow32_leaky, 15); + __m128i _vhigh32_16_leaky = __lsx_vsat_w(_vhigh32_leaky, 15); + __m128i _v16_leaky = __lsx_vpickev_h(_vhigh32_16_leaky, _vlow32_16_leaky); + + _v16 = __lsx_vmax_h(_v16, _v16_leaky); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8[0]; +} +#endif // __loongarch_sx + +#endif // LOONGARCH_USABILITY_H diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h new file mode 100644 index 000000000000..ededa5966593 --- /dev/null +++ b/src/layer/loongarch/lsx_mathfun.h @@ -0,0 +1,258 @@ +/* LOONGARCH implementation of exp + * + * Inspired by Intel Approximate Math library, and based on the + * corresponding algorithms of the cephes math library + * Copyright (C) 2022 yala ;. All rights reserved. + */ + +/* + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * (this is the zlib license) + */ + +#ifndef LSX_MATHFUN_H +#define LSX_MATHFUN_H + +#include "loongarch_usability.h" + +#include + +_LOONGARCH_FLOAT_CONST(c_1, 1.0f); +_LOONGARCH_FLOAT_CONST(c_2, 2.0f); +_LOONGARCH_FLOAT_CONST(c_n1, -1.0f); +_LOONGARCH_FLOAT_CONST(c_0p5, 0.5f); + +#define c_inv_mant_mask ~0x7f800000u +_LOONGARCH_FLOAT_CONST(c_cephes_SQRTHF, 0.707106781186547524); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p0, 7.0376836292E-2); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p1, -1.1514610310E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p2, 1.1676998740E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p3, -1.2420140846E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p4, +1.4249322787E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p5, -1.6668057665E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p6, +2.0000714765E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p7, -2.4999993993E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p8, +3.3333331174E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_q1, -2.12194440e-4); +_LOONGARCH_FLOAT_CONST(c_cephes_log_q2, 0.693359375); + +/* natural logarithm computed for 4 simultaneous float + * return NaN for x <= 0 + */ +static inline __m128 log_ps(__m128 x) +{ + __m128 one = (__m128)__lsx_vreplgr2vr_w(c_1.i); + + x = __lsx_vfmax_s(x, (__m128)__lsx_vreplgr2vr_w(0)); /* force flush to zero on denormal values */ + __m128i invalid_mask = __lsx_vfcmp_cle_s(x, (__m128)__lsx_vreplgr2vr_w(0)); + + __m128i ux = (__m128i)(x); + + __m128i emm0 = __lsx_vsrl_w(ux, __lsx_vreplgr2vr_w(23)); + + /* keep only the fractional part */ + ux = __lsx_vand_v(ux, __lsx_vreplgr2vr_w(c_inv_mant_mask)); + ux = __lsx_vor_v(ux, __lsx_vreplgr2vr_w(c_0p5.i)); + x = (__m128)(ux); + + emm0 = __lsx_vsub_w(emm0, __lsx_vreplgr2vr_w(0x7f)); + __m128 e = __lsx_vffint_s_w(emm0); + + e = __lsx_vfadd_s(e, one); + + /* part2: + * if( x < 
SQRTHF ) { + * e -= 1; + * x = x + x - 1.0; + * } else { x = x - 1.0; } + */ + __m128i mask = __lsx_vfcmp_clt_s((__m128)x, (__m128)__lsx_vreplgr2vr_w(c_cephes_SQRTHF.i)); + __m128 tmp = (__m128)(__lsx_vand_v((__m128i)(x), (__m128i)mask)); + x = __lsx_vfsub_s(x, one); + e = __lsx_vfsub_s(e, (__m128)(__lsx_vand_v((__m128i)(one), (__m128i)mask))); + x = __lsx_vfadd_s(x, tmp); + + __m128 z = __lsx_vfmul_s(x, x); + + __m128 y = (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p0.i); + + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p1.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p2.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p3.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p4.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p5.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p6.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p7.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p8.i)); + y = __lsx_vfmul_s(y, x); + + y = __lsx_vfmul_s(y, z); + + tmp = __lsx_vfmul_s(e, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_q1.i)); + y = __lsx_vfadd_s(y, tmp); + + tmp = __lsx_vfmul_s(z, (__m128)__lsx_vreplgr2vr_w(c_0p5.i)); + y = __lsx_vfsub_s(y, tmp); + + tmp = __lsx_vfmul_s(e, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_q2.i)); + x = __lsx_vfadd_s(x, y); + x = __lsx_vfadd_s(x, tmp); + x = (__m128)(__lsx_vor_v((__m128i)(x), (__m128i)invalid_mask)); // negative arg will be NAN + return x; +} + +_LOONGARCH_FLOAT_CONST(c_exp_hi, 88.3762626647949f); +_LOONGARCH_FLOAT_CONST(c_exp_lo, -88.3762626647949f); + +_LOONGARCH_FLOAT_CONST(c_cephes_LOG2EF, 1.44269504088896341); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_C1, 0.693359375); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_C2, -2.12194440e-4); + +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p0, 1.9875691500E-4); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p1, 1.3981999507E-3); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p2, 8.3334519073E-3); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p3, 4.1665795894E-2); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p4, 1.6666665459E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p5, 5.0000001201E-1); + +/* exp() computed for 4 float at once */ +static inline __m128 exp_ps(__m128 x) +{ + __m128 tmp, fx; + + __m128 one = (__m128)__lsx_vreplgr2vr_w(c_1.i); + x = __lsx_vfmin_s(x, (__m128)__lsx_vreplgr2vr_w(c_exp_hi.i)); + x = __lsx_vfmax_s(x, (__m128)__lsx_vreplgr2vr_w(c_exp_lo.i)); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = __lsx_vfmul_s(x, (__m128)__lsx_vreplgr2vr_w(c_cephes_LOG2EF.i)); + fx = __lsx_vfadd_s(fx, (__m128)__lsx_vreplgr2vr_w(c_0p5.i)); + + /* perform a floorf */ + tmp = __lsx_vffint_s_w(__lsx_vftint_w_s(fx)); + + /* if greater, substract 1 */ + __m128i mask = __lsx_vfcmp_clt_s(fx, tmp); + mask = __lsx_vand_v(mask, (__m128i)one); + + fx = __lsx_vfsub_s(tmp, (__m128)mask); + + tmp = __lsx_vfmul_s(fx, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_C1.i)); + __m128 z = __lsx_vfmul_s(fx, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_C2.i)); + x = __lsx_vfsub_s(x, tmp); + x = __lsx_vfsub_s(x, z); + + __m128 y = (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p0.i); + + z = __lsx_vfmul_s(x, x); + + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p1.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p2.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p3.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p4.i)); + y = __lsx_vfmadd_s(x, y, 
(__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p5.i)); + + y = __lsx_vfmul_s(y, z); + y = __lsx_vfadd_s(y, x); + y = __lsx_vfadd_s(y, one); + + /* build 2^n */ + __m128i mm; + mm = __lsx_vftintrz_w_s(fx); + mm = __lsx_vadd_w(mm, __lsx_vreplgr2vr_w(0x7f)); + mm = __lsx_vsll_w(mm, __lsx_vreplgr2vr_w(23)); + + y = __lsx_vfmul_s(y, (__m128)mm); + return y; +} + +_LOONGARCH_FLOAT_CONST(c_tanh_tiny, 1e-4f); +_LOONGARCH_FLOAT_CONST(c_tanh_hi, 9.0f); +// The monomial coefficients of the numerator polynomial (odd). +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_1, 4.89352455891786e-3f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_3, 6.37261928875436e-4f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_5, 1.48572235717979e-5f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_7, 5.12229709037114e-8f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_9, -8.60467152213735e-11f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_11, 2.00018790482477e-13f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_13, -2.76076847742355e-16f); +// The monomial coefficients of the denominator polynomial (even). +_LOONGARCH_FLOAT_CONST(c_tanh_beta_0, 4.89352518554385e-3f); +_LOONGARCH_FLOAT_CONST(c_tanh_beta_2, 2.26843463243900e-3f); +_LOONGARCH_FLOAT_CONST(c_tanh_beta_4, 1.18534705686654e-4f); +_LOONGARCH_FLOAT_CONST(c_tanh_beta_6, 1.19825839466702e-6f); + +/* tanh() computed for 4 float at once */ +static inline __m128 tanh_ps(__m128 x) +{ + __m128 x2 = (__m128)__lsx_vbitclri_w((__m128i)x, 31); + __m128i tiny_mask = __lsx_vfcmp_clt_s((__m128)x2, (__m128)(__m128)__lsx_vreplgr2vr_w(c_tanh_tiny.i)); + __m128i sig_mask = __lsx_vreplgr2vr_w(1 << 31); + __m128i sig_save = __lsx_vand_v((__m128i)x, sig_mask); + + // clamp the inputs to the range [-9, 9] since anything outside + // this range is -/+1.0f in single-precision. + x2 = (__m128)__lsx_vbitsel_v((__m128i)x2, (__m128i)__lsx_vreplgr2vr_w(c_tanh_hi.i), (__m128i)__lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(c_tanh_hi.i), (__m128)x2)); + + // since the polynomials are odd/even, we need x**2. + __m128 z = __lsx_vfmul_s(x2, x2); + + // evaluate the numerator polynomial y. + __m128 y = (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_13.i); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_11.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_9.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_7.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_5.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_3.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_1.i)); + y = __lsx_vfmul_s(y, x2); + + // evaluate the denominator polynomial w. + __m128 w = (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_6.i); + w = __lsx_vfmadd_s(z, w, (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_4.i)); + w = __lsx_vfmadd_s(z, w, (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_2.i)); + w = __lsx_vfmadd_s(z, w, (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_0.i)); + + // divide the numerator by the denominator. + y = __lsx_vfdiv_s(y, w); + + // reinstate the sign. + y = (__m128)__lsx_vor_v((__m128i)y, sig_save); + + // when the argument is very small in magnitude it's more accurate to just return it. 
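+    // (|x| < c_tanh_tiny selects the raw input through tiny_mask below; other
+    // inputs use the odd/even polynomial ratio y / w evaluated above, with |x|
+    // clamped to c_tanh_hi = 9.0, where single-precision tanh has already
+    // saturated to +/-1)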
+ y = (__m128)__lsx_vbitsel_v((__m128i)y, (__m128i)x, (__m128i)tiny_mask); + + return y; +} + +static inline __m128 pow_ps(__m128 a, __m128 b) +{ + // pow(x, m) = exp(m * log(x)) + return exp_ps(__lsx_vfmul_s(b, log_ps(a))); +} + +static inline __m128 sigmoid_ps(__m128 _v) +{ + __m128 _one = __lsx_vreplfr2vr_s(1.f); + _v = (__m128)__lsx_vbitrevi_w((__m128i)_v, 31); + _v = exp_ps(_v); + _v = __lsx_vfadd_s(_v, _one); + return __lsx_vfdiv_s(_one, _v); +} + +#endif // LSX_MATHFUN_H diff --git a/src/layer/loongarch/mish_loongarch.cpp b/src/layer/loongarch/mish_loongarch.cpp new file mode 100644 index 000000000000..8558e2f8cb06 --- /dev/null +++ b/src/layer/loongarch/mish_loongarch.cpp @@ -0,0 +1,70 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "mish_loongarch.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +#include + +namespace ncnn { + +Mish_loongarch::Mish_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Mish_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmul_s(_p, tanh_ps(log_ps(__lsx_vfadd_s(exp_ps(_p), _one)))); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = *ptr * tanh(log(exp(*ptr) + 1.f)); + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/mish_loongarch.h b/src/layer/loongarch/mish_loongarch.h new file mode 100644 index 000000000000..97c6f0520f50 --- /dev/null +++ b/src/layer/loongarch/mish_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
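+//
+// Mish activation: mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + e^x)).
+// The LSX path in mish_loongarch.cpp evaluates this four lanes at a time with
+// exp_ps / log_ps / tanh_ps from lsx_mathfun.h; the scalar tail uses libm and
+// is equivalent to the reference expression
+//
+//     float mish_ref(float x) { return x * tanhf(logf(expf(x) + 1.f)); }
+//
+// (mish_ref is only an illustrative name, not a function defined in ncnn)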
+ +#ifndef LAYER_MISH_LOONGARCH_H +#define LAYER_MISH_LOONGARCH_H + +#include "mish.h" + +namespace ncnn { + +class Mish_loongarch : virtual public Mish +{ +public: + Mish_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_MISH_LOONGARCH_H diff --git a/src/layer/loongarch/packing_loongarch.cpp b/src/layer/loongarch/packing_loongarch.cpp new file mode 100644 index 000000000000..cf68b7b34d69 --- /dev/null +++ b/src/layer/loongarch/packing_loongarch.cpp @@ -0,0 +1,569 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "packing_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +namespace ncnn { + +Packing_loongarch::Packing_loongarch() +{ + support_packing = true; +} + +int Packing_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + if (elembits == 8) + return forward_int8(bottom_blob, top_blob, opt); + + if (use_padding) + { + return Packing::forward(bottom_blob, top_blob, opt); + } + + if (elembits != 32) + { + // non-fp32 type + return Packing::forward(bottom_blob, top_blob, opt); + } + + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + if (elempack == out_elempack) + { + top_blob = bottom_blob; + return 0; + } + + bool pack1to4 = elempack == 1 && out_elempack == 4; + bool pack4to1 = elempack == 4 && out_elempack == 1; + + if (!pack1to4 && !pack4to1) + { + return Packing::forward(bottom_blob, top_blob, opt); + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + + if (!use_padding) + { + // identity if use_padding not allowed + if (dims == 1 && w * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if (dims == 2 && h * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if ((dims == 3 || dims == 4) && channels * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + } + + if (dims == 1) + { + top_blob = bottom_blob; + top_blob.w = w * elempack / out_elempack; + top_blob.cstep = w * elempack / out_elempack; + top_blob.elemsize = elemsize / elempack * out_elempack; + top_blob.elempack = out_elempack; + return 0; + } + + if (dims == 2) + { + int outh = h * elempack / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (pack1to4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* r0 = bottom_blob.row(i * 4); + const float* r1 = bottom_blob.row(i * 4 + 1); + const float* r2 = bottom_blob.row(i * 4 + 2); + const float* r3 = bottom_blob.row(i * 4 + 3); + + float* 
outptr = top_blob.row(i); + + int j = 0; +#if __loongarch_sx + for (; j + 3 < w; j += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r3 = __lsx_vld(r3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr, 0); + __lsx_vst(_r0123_1, outptr + 4, 0); + __lsx_vst(_r0123_2, outptr + 4 * 2, 0); + __lsx_vst(_r0123_3, outptr + 4 * 3, 0); + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + outptr += 16; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + outptr[0] = *r0++; + outptr[1] = *r1++; + outptr[2] = *r2++; + outptr[3] = *r3++; + + outptr += 4; + } + } + } + if (pack4to1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* r0 = bottom_blob.row(i); + + float* outptr0 = top_blob.row(i * 4); + float* outptr1 = top_blob.row(i * 4 + 1); + float* outptr2 = top_blob.row(i * 4 + 2); + float* outptr3 = top_blob.row(i * 4 + 3); + + int j = 0; +#if __loongarch_sx + for (; j + 3 < w; j += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr0, 0); + __lsx_vst(_r0123_1, outptr1, 0); + __lsx_vst(_r0123_2, outptr2, 0); + __lsx_vst(_r0123_3, outptr3, 0); + + r0 += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + } + } + + return 0; + } + + if (dims == 3 || dims == 4) + { + int size = w * h * d; + int outc = channels * elempack / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (dims == 3) + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + else // if (dims == 4) + top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (pack1to4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* r0 = bottom_blob.channel(q * 4); + const float* r1 = bottom_blob.channel(q * 4 + 1); + const float* r2 = bottom_blob.channel(q * 4 + 2); + const float* r3 = bottom_blob.channel(q * 4 + 3); + + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r3 = __lsx_vld(r3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = 
__lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr, 0); + __lsx_vst(_r0123_1, outptr + 4, 0); + __lsx_vst(_r0123_2, outptr + 4 * 2, 0); + __lsx_vst(_r0123_3, outptr + 4 * 3, 0); + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + outptr += 16; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + outptr[0] = *r0++; + outptr[1] = *r1++; + outptr[2] = *r2++; + outptr[3] = *r3++; + + outptr += 4; + } + } + } + if (pack4to1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* r0 = bottom_blob.channel(q); + + float* outptr0 = top_blob.channel(q * 4); + float* outptr1 = top_blob.channel(q * 4 + 1); + float* outptr2 = top_blob.channel(q * 4 + 2); + float* outptr3 = top_blob.channel(q * 4 + 3); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr0, 0); + __lsx_vst(_r0123_1, outptr1, 0); + __lsx_vst(_r0123_2, outptr2, 0); + __lsx_vst(_r0123_3, outptr3, 0); + + r0 += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + } + } + + return 0; + } + + return 0; +} + +int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + if (use_padding) + { + return Packing::forward(bottom_blob, top_blob, opt); + } + + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + if (elempack == out_elempack) + { + top_blob = bottom_blob; + return 0; + } + + bool pack1to8 = elempack == 1 && out_elempack == 8; + bool pack8to1 = elempack == 8 && out_elempack == 1; + + if (!pack1to8 && !pack8to1) + { + return Packing::forward(bottom_blob, top_blob, opt); + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + + if (!use_padding) + { + // identity if use_padding not allowed + if (dims == 1 && w * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if (dims == 2 && h * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if ((dims == 3 || dims == 4) && channels * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + } + + if (dims == 1) + { + top_blob = bottom_blob; + top_blob.w = w * elempack / out_elempack; + top_blob.cstep = w * elempack / out_elempack; + top_blob.elemsize = elemsize / elempack * out_elempack; + top_blob.elempack = out_elempack; + return 0; + } + + if (dims == 2) + { + int outh = h * elempack / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if 
(pack1to8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const signed char* r0 = bottom_blob.row(i * 8); + const signed char* r1 = bottom_blob.row(i * 8 + 1); + const signed char* r2 = bottom_blob.row(i * 8 + 2); + const signed char* r3 = bottom_blob.row(i * 8 + 3); + const signed char* r4 = bottom_blob.row(i * 8 + 4); + const signed char* r5 = bottom_blob.row(i * 8 + 5); + const signed char* r6 = bottom_blob.row(i * 8 + 6); + const signed char* r7 = bottom_blob.row(i * 8 + 7); + + signed char* outptr = top_blob.row(i); + + int j = 0; + for (; j < w; j++) + { + outptr[0] = *r0++; + outptr[1] = *r1++; + outptr[2] = *r2++; + outptr[3] = *r3++; + outptr[4] = *r4++; + outptr[5] = *r5++; + outptr[6] = *r6++; + outptr[7] = *r7++; + + outptr += 8; + } + } + } + if (pack8to1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const signed char* r0 = bottom_blob.row(i); + + signed char* outptr0 = top_blob.row(i * 8); + signed char* outptr1 = top_blob.row(i * 8 + 1); + signed char* outptr2 = top_blob.row(i * 8 + 2); + signed char* outptr3 = top_blob.row(i * 8 + 3); + signed char* outptr4 = top_blob.row(i * 8 + 4); + signed char* outptr5 = top_blob.row(i * 8 + 5); + signed char* outptr6 = top_blob.row(i * 8 + 6); + signed char* outptr7 = top_blob.row(i * 8 + 7); + + int j = 0; + for (; j < w; j++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + *outptr4++ = r0[4]; + *outptr5++ = r0[5]; + *outptr6++ = r0[6]; + *outptr7++ = r0[7]; + + r0 += 8; + } + } + } + + return 0; + } + + if (dims == 3 || dims == 4) + { + int size = w * h * d; + int outc = channels * elempack / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (dims == 3) + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + else // if (dims == 4) + top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (pack1to8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const signed char* r0 = bottom_blob.channel(q * 8); + const signed char* r1 = bottom_blob.channel(q * 8 + 1); + const signed char* r2 = bottom_blob.channel(q * 8 + 2); + const signed char* r3 = bottom_blob.channel(q * 8 + 3); + const signed char* r4 = bottom_blob.channel(q * 8 + 4); + const signed char* r5 = bottom_blob.channel(q * 8 + 5); + const signed char* r6 = bottom_blob.channel(q * 8 + 6); + const signed char* r7 = bottom_blob.channel(q * 8 + 7); + + signed char* outptr = top_blob.channel(q); + + int i = 0; + for (; i < size; i++) + { + outptr[0] = *r0++; + outptr[1] = *r1++; + outptr[2] = *r2++; + outptr[3] = *r3++; + outptr[4] = *r4++; + outptr[5] = *r5++; + outptr[6] = *r6++; + outptr[7] = *r7++; + + outptr += 8; + } + } + } + if (pack8to1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const signed char* r0 = bottom_blob.channel(q); + + signed char* outptr0 = top_blob.channel(q * 8); + signed char* outptr1 = top_blob.channel(q * 8 + 1); + signed char* outptr2 = top_blob.channel(q * 8 + 2); + signed char* outptr3 = top_blob.channel(q * 8 + 3); + signed char* outptr4 = top_blob.channel(q * 8 + 4); + signed char* outptr5 = top_blob.channel(q * 8 + 5); + signed char* outptr6 = top_blob.channel(q * 8 + 6); + signed char* outptr7 = top_blob.channel(q * 8 + 7); + + int i = 0; + for (; i < size; i++) + { + 
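+                    // de-interleave one elempack=8 group: byte k of the packed
+                    // element is written to output channel k (plain scalar copy;
+                    // the int8 repack has no dedicated LSX shuffle path here)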
*outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + *outptr4++ = r0[4]; + *outptr5++ = r0[5]; + *outptr6++ = r0[6]; + *outptr7++ = r0[7]; + + r0 += 8; + } + } + } + + return 0; + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/packing_loongarch.h b/src/layer/loongarch/packing_loongarch.h new file mode 100644 index 000000000000..1db215cfee7a --- /dev/null +++ b/src/layer/loongarch/packing_loongarch.h @@ -0,0 +1,35 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_PACKING_LOONGARCH_H +#define LAYER_PACKING_LOONGARCH_H + +#include "packing.h" + +namespace ncnn { + +class Packing_loongarch : virtual public Packing +{ +public: + Packing_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_PACKING_LOONGARCH_H diff --git a/src/layer/loongarch/padding_loongarch.cpp b/src/layer/loongarch/padding_loongarch.cpp new file mode 100644 index 000000000000..1f345ce60532 --- /dev/null +++ b/src/layer/loongarch/padding_loongarch.cpp @@ -0,0 +1,385 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
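+//
+// Packed-layout padding strategy used below:
+//   - fp32 elempack=4 and int8 elempack=8 borders are filled by the kernels in
+//     padding_pack4.h / padding_pack8_int8.h, but only when the requested
+//     top/left/front offsets keep the packing aligned (e.g. top % 4 == 0) and
+//     the padding type allows it; every other case unpacks to elempack=1, runs
+//     the generic Padding::forward(), and repacks the result.
+//   - the int8 constant pad value replicates the low byte of the (truncated)
+//     fill value into all eight bytes of an int64_t, so a single 64-bit store
+//     fills one whole elempack=8 group; per-channel int8 pad values are still
+//     a TODO in this version.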
+ +#include "padding_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +#if __loongarch_sx +#include "padding_pack4.h" +#include "padding_pack8_int8.h" +#endif // __loongarch_sx + +Padding_loongarch::Padding_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Padding_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + if (top == 0 && bottom == 0 && left == 0 && right == 0 && front == 0 && behind == 0) + { + top_blob = bottom_blob; + return 0; + } + + int elembits = bottom_blob.elembits(); + + if (elembits == 8) + return forward_int8(bottom_blob, top_blob, opt); + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 4) + { + if (dims == 1) + { + int outw = w * elempack + left + right; + + int out_elempack = outw % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (left % 4 == 0 && out_elempack == 4 && type == 0) + { + top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + __m128 pad_value = __lsx_vreplfr2vr_s(value); + padding_constant_pack4_lsx(bottom_blob, top_blob, 0, 0, left / 4, right / 4, pad_value); + + return 0; + } + } + + if (dims == 2) + { + int outw = w + left + right; + int outh = h * elempack + top + bottom; + + int out_elempack = outh % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (top % 4 == 0 && out_elempack == 4 && type == 0) + { + top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + __m128 pad_value = __lsx_vreplfr2vr_s(value); + padding_constant_pack4_lsx(bottom_blob, top_blob, top / 4, bottom / 4, left, right, pad_value); + + return 0; + } + } + + if (dims == 3) + { + int outw = w + left + right; + int outh = h + top + bottom; + int outc = channels * elempack + front + behind; + + int out_elempack = outc % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (front % 4 == 0 && out_elempack == 4 && !(outc != channels * elempack && type != 0)) + { + top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int front_ = front / elempack; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc / out_elempack; q++) + { + Mat borderm = top_blob.channel(q); + + __m128 pad_value = per_channel_pad_data_size ? 
(__m128)__lsx_vld((const float*)per_channel_pad_data + q * 4, 0) : __lsx_vreplfr2vr_s(value); + //Channel padding + if ((q - front_) < 0 || (q - front_) >= channels) + { + borderm.fill(pad_value); + } + else + { + const Mat m = bottom_blob.channel(q - front_); + if (type == 0) + padding_constant_pack4_lsx(m, borderm, top, bottom, left, right, pad_value); + if (type == 1) + padding_replicate_pack4_lsx(m, borderm, top, bottom, left, right); + if (type == 2) + padding_reflect_pack4_lsx(m, borderm, top, bottom, left, right); + } + } + + return 0; + } + } + + if (dims == 4) + { + int outw = w + left + right; + int outh = h + top + bottom; + int outd = d + front + behind; + + if (type == 0) + { + top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __m128 pad_value = per_channel_pad_data_size ? (__m128)__lsx_vld((const float*)per_channel_pad_data + q * 4, 0) : __lsx_vreplfr2vr_s(value); + + for (int z = 0; z < outd; z++) + { + Mat borderm = top_blob.channel(q).depth(z); + + // depth padding + if ((z - front) < 0 || (z - front) >= d) + { + borderm.fill(pad_value); + } + else + { + const Mat m = bottom_blob.channel(q).depth(z - front); + padding_constant_pack4_lsx(m, borderm, top, bottom, left, right, pad_value); + } + } + } + + return 0; + } + } + } +#endif // __loongarch_sx + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + } + + Mat top_blob_unpacked; + int ret = Padding::forward(bottom_blob_unpacked, top_blob_unpacked, opt); + if (ret != 0) + return ret; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = top_blob_unpacked.c % 4 == 0 ? 4 : 1; + } +#endif + + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + + return 0; +} + +int Padding_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 8) + { + if (dims == 1) + { + int outw = w * elempack + left + right; + + int out_elempack = outw % 8 == 0 ? 8 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (left % 8 == 0 && out_elempack == 8 && type == 0) + { + top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int64_t v8 = (int64_t)value; + int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56); + padding_constant_pack8_int8_lsx(bottom_blob, top_blob, 0, 0, left / 8, right / 8, pad_value); + + return 0; + } + } + + if (dims == 2) + { + int outw = w + left + right; + int outh = h * elempack + top + bottom; + + int out_elempack = outh % 8 == 0 ? 
8 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (top % 8 == 0 && out_elempack == 8 && type == 0) + { + top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int64_t v8 = (int64_t)value; + int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56); + padding_constant_pack8_int8_lsx(bottom_blob, top_blob, top / 8, bottom / 8, left, right, pad_value); + + return 0; + } + } + + if (dims == 3) + { + int outw = w + left + right; + int outh = h + top + bottom; + int outc = channels * elempack + front + behind; + + int out_elempack = outc % 8 == 0 ? 8 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (front % 8 == 0 && out_elempack == 8 && !(outc != channels * elempack && type != 0)) + { + top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int front_ = front / elempack; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc / out_elempack; q++) + { + Mat borderm = top_blob.channel(q); + + // TODO perchannel + // int64_t pad_value = per_channel_pad_data_size ? vld1_s8(per_channel_pad_data + q * 8) : vdup_n_s8((signed char)value); + int64_t v8 = (int64_t)value; + int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56); + + //Channel padding + if ((q - front_) < 0 || (q - front_) >= channels) + { + borderm.fill(pad_value); + } + else + { + const Mat m = bottom_blob.channel(q - front_); + if (type == 0) + padding_constant_pack8_int8_lsx(m, borderm, top, bottom, left, right, pad_value); + if (type == 1) + padding_replicate_pack8_int8_lsx(m, borderm, top, bottom, left, right); + if (type == 2) + padding_reflect_pack8_int8_lsx(m, borderm, top, bottom, left, right); + } + } + + return 0; + } + } + + if (dims == 4) + { + int outw = w + left + right; + int outh = h + top + bottom; + int outd = d + front + behind; + + if (type == 0) + { + top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + // TODO perchannel + // int64_t pad_value = per_channel_pad_data_size ? vld1_s8(per_channel_pad_data + q * 8) : vdup_n_s8((signed char)value); + int64_t v8 = (int64_t)value; + int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56); + + for (int z = 0; z < outd; z++) + { + Mat borderm = top_blob.channel(q).depth(z); + + // depth padding + if ((z - front) < 0 || (z - front) >= d) + { + borderm.fill(pad_value); + } + else + { + const Mat m = bottom_blob.channel(q).depth(z - front); + padding_constant_pack8_int8_lsx(m, borderm, top, bottom, left, right, pad_value); + } + } + } + + return 0; + } + } + } +#endif // __loongarch_sx + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + } + + Mat top_blob_unpacked; + int ret = Padding::forward(bottom_blob_unpacked, top_blob_unpacked, opt); + if (ret != 0) + return ret; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = top_blob_unpacked.c % 8 == 0 ? 
8 : 1; + } +#endif + + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/padding_loongarch.h b/src/layer/loongarch/padding_loongarch.h new file mode 100644 index 000000000000..137fbc4459ec --- /dev/null +++ b/src/layer/loongarch/padding_loongarch.h @@ -0,0 +1,35 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_PADDING_LOONGARCH_H +#define LAYER_PADDING_LOONGARCH_H + +#include "padding.h" + +namespace ncnn { + +class Padding_loongarch : virtual public Padding +{ +public: + Padding_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_PADDING_LOONGARCH_H diff --git a/src/layer/loongarch/padding_pack4.h b/src/layer/loongarch/padding_pack4.h new file mode 100644 index 000000000000..d040ce778b58 --- /dev/null +++ b/src/layer/loongarch/padding_pack4.h @@ -0,0 +1,213 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
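+//
+// Border-fill kernels for elempack=4 fp32 data. Every logical pixel is a group
+// of 4 floats, so the pointer arithmetic below advances in steps of 4 and the
+// top/bottom/left/right counts are in whole groups:
+//   - constant: store the splatted pad value for the border, copy rows verbatim
+//     for the interior;
+//   - replicate: re-load the nearest edge pixel for the border;
+//   - reflect: mirror without repeating the edge pixel, e.g. the left border at
+//     offset x reads the pixel (left - x) groups into the row, and the right
+//     border walks backwards from the last-but-one pixel (ptr - 8 - x * 4).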
+ +static void padding_constant_pack4_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right, __m128 v) +{ + const float* ptr = src; + float* outptr = dst; + int top_size = top * dst.w; + int bottom_size = bottom * dst.w; + + // fill top + for (int y = 0; y < top_size; y++) + { + __lsx_vst(v, outptr, 0); + outptr += 4; + } + // fill center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + __lsx_vst(v, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + __builtin_prefetch(ptr + 32); + __lsx_vst(__lsx_vld(ptr, 0), outptr, 0); + ptr += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __lsx_vst(v, outptr, 0); + outptr += 4; + } + } + // fill top + for (int y = 0; y < bottom_size; y++) + { + __lsx_vst(v, outptr, 0); + outptr += 4; + } +} + +static void padding_replicate_pack4_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right) +{ + const float* ptr = src; + float* outptr = dst; + + // fill top + for (int y = 0; y < top; y++) + { + const float* ptr0 = ptr; + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + for (int x = 0; x < left; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + _p = (__m128)__lsx_vld(ptr0, 0); + __lsx_vst(_p, outptr, 0); + ptr0 += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + } + // fill center + for (int y = 0; y < src.h; y++) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + for (int x = 0; x < left; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + _p = (__m128)__lsx_vld(ptr, 0); + __lsx_vst(_p, outptr, 0); + ptr += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + } + // fill bottom + ptr -= src.w * 4; + for (int y = 0; y < bottom; y++) + { + const float* ptr0 = ptr; + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + for (int x = 0; x < left; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + _p = (__m128)__lsx_vld(ptr0, 0); + __lsx_vst(_p, outptr, 0); + ptr0 += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + } +} + +static void padding_reflect_pack4_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right) +{ + const float* ptr = src; + float* outptr = dst; + + // fill top + ptr += top * src.w * 4; + for (int y = 0; y < top; y++) + { + const float* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0 + (left - x) * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __lsx_vst(_p, outptr, 0); + ptr0 += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0 - 8 - x * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + ptr -= src.w * 4; + } + // fill center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr + (left - x) * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __lsx_vst(_p, outptr, 0); + ptr += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr - 8 - x * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + } + // fill bottom + ptr -= 2 * src.w * 4; + for (int y = 0; y < bottom; y++) + { + const float* ptr0 = 
ptr; + for (int x = 0; x < left; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0 + (left - x) * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __lsx_vst(_p, outptr, 0); + ptr0 += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0 - 8 - x * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + ptr -= src.w * 4; + } +} diff --git a/src/layer/loongarch/padding_pack8_int8.h b/src/layer/loongarch/padding_pack8_int8.h new file mode 100644 index 000000000000..4c6586c6ae27 --- /dev/null +++ b/src/layer/loongarch/padding_pack8_int8.h @@ -0,0 +1,171 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void padding_constant_pack8_int8_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int64_t _v) +{ + const int64_t* ptr = src; + int64_t* outptr = dst; + + // fill top + for (int y = 0; y < top; y++) + { + for (int x = 0; x < dst.w; x++) + { + *outptr++ = _v; + } + } + // fill center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + *outptr++ = _v; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = _v; + } + } + // fill bottom + for (int y = 0; y < bottom; y++) + { + for (int x = 0; x < dst.w; x++) + { + *outptr++ = _v; + } + } +} + +static void padding_replicate_pack8_int8_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right) +{ + const int64_t* ptr = src; + int64_t* outptr = dst; + + // fill top + for (int y = 0; y < top; y++) + { + const int64_t* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + *outptr++ = *ptr0; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr0++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr0[-1]; + } + } + // fill center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + *outptr++ = *ptr; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr[-1]; + } + } + // fill bottom + ptr -= src.w; + for (int y = 0; y < bottom; y++) + { + const int64_t* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + *outptr++ = *ptr0; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr0++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr0[-1]; + } + } +} + +static void padding_reflect_pack8_int8_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right) +{ + const int64_t* ptr = src; + int64_t* outptr = dst; + + // fill top + ptr += top * src.w; + for (int y = 0; y < top; y++) + { + const int64_t* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + *outptr++ = ptr0[left - x]; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr0++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr0[-2 - x]; + } + ptr -= src.w; + } + // fill 
center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + *outptr++ = ptr[left - x]; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr[-2 - x]; + } + } + // fill bottom + ptr -= 2 * src.w; + for (int y = 0; y < bottom; y++) + { + const int64_t* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + *outptr++ = ptr0[left - x]; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr0++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr0[-2 - x]; + } + ptr -= src.w; + } +} diff --git a/src/layer/loongarch/pooling_loongarch.cpp b/src/layer/loongarch/pooling_loongarch.cpp new file mode 100644 index 000000000000..9d9889713244 --- /dev/null +++ b/src/layer/loongarch/pooling_loongarch.cpp @@ -0,0 +1,291 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pooling_loongarch.h" + +#include + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Pooling_loongarch::Pooling_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Pooling_loongarch::create_pipeline(const Option& /*opt*/) +{ + if (adaptive_pooling) + { + support_packing = false; + + support_bf16_storage = false; + support_fp16_storage = false; + support_int8_storage = false; + support_tensor_storage = false; + } + return 0; +} + +int Pooling_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + if (adaptive_pooling) + { + return Pooling::forward(bottom_blob, top_blob, opt); + } + + // max value in NxN window + // avg value in NxN window + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h); + + if (elempack == 4) + { + if (global_pooling) + { + top_blob.create(channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int size = w * h; + + if (pooling_type == PoolMethod_MAX) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + + __m128 _max = (__m128)__lsx_vld(ptr, 0); + for (int i = 0; i < size; i++) + { + __m128 _val = (__m128)__lsx_vld(ptr, 0); + _max = __lsx_vfmax_s(_max, _val); + ptr += 4; + } + + float* outptr = top_blob; + __lsx_vst(_max, outptr + q * 4, 0); + } + } + else if (pooling_type == PoolMethod_AVE) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + for (int i = 0; i < 
size; i++) + { + __m128 _val = (__m128)__lsx_vld(ptr, 0); + _sum = __lsx_vfadd_s(_sum, _val); + ptr += 4; + } + + __m128 _avg = __lsx_vfmul_s(_sum, __lsx_vreplfr2vr_s(1.f / size)); + + float* outptr = top_blob; + __lsx_vst(_avg, outptr + q * 4, 0); + } + } + + return 0; + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_w) / stride_w + 1; + int outh = (h - kernel_h) / stride_h + 1; + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w - kernel_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2++; + } + p2 += gap; + } + } + + if (pooling_type == PoolMethod_MAX) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + float* outptr = top_blob.channel(q); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + __m128 _max = (__m128)__lsx_vld(sptr, 0); + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0); + _max = __lsx_vfmax_s(_max, _val); + } + + __lsx_vst(_max, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + else if (pooling_type == PoolMethod_AVE) + { + if (avgpool_count_include_pad == 0) + { + int wtailpad = 0; + int htailpad = 0; + + if (pad_mode == 0) // full padding + { + wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right; + htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + float* outptr = top_blob.channel(q); + + for (int i = 0; i < outh; i++) + { + int sy0 = i * stride_h; + + for (int j = 0; j < outw; j++) + { + int sx0 = j * stride_w; + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + int area = 0; + + for (int ki = 0; ki < kernel_h; ki++) + { + int sy = sy0 + ki; + + if (sy < pad_top) + continue; + + if (sy >= h - pad_bottom - htailpad) + break; + + for (int kj = 0; kj < kernel_w; kj++) + { + int sx = sx0 + kj; + + if (sx < pad_left) + continue; + + if (sx >= w - pad_right - wtailpad) + break; + + __m128 _val = (__m128)__lsx_vld(m.row(sy) + sx * 4, 0); + _sum = __lsx_vfadd_s(_sum, _val); + area += 1; + } + } + + __m128 _avg = __lsx_vfmul_s(_sum, __lsx_vreplfr2vr_s(1.f / area)); + __lsx_vst(_avg, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + else // if (avgpool_count_include_pad == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + float* outptr = top_blob.channel(q); + + const float inv_maxk = 1.f / maxk; + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0); + _sum = __lsx_vfadd_s(_sum, _val); + } + + __m128 _avg = 
__lsx_vfmul_s(_sum, __lsx_vreplfr2vr_s(inv_maxk)); + __lsx_vst(_avg, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + } + + return 0; + } +#endif // __loongarch_sx + + return Pooling::forward(bottom_blob, top_blob, opt); +} + +} // namespace ncnn diff --git a/src/layer/loongarch/pooling_loongarch.h b/src/layer/loongarch/pooling_loongarch.h new file mode 100644 index 000000000000..97e0c9ff2f7e --- /dev/null +++ b/src/layer/loongarch/pooling_loongarch.h @@ -0,0 +1,33 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_POOLING_LOONGARCH_H +#define LAYER_POOLING_LOONGARCH_H + +#include "pooling.h" + +namespace ncnn { + +class Pooling_loongarch : virtual public Pooling +{ +public: + Pooling_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_POOLING_LOONGARCH_H diff --git a/src/layer/loongarch/prelu_loongarch.cpp b/src/layer/loongarch/prelu_loongarch.cpp new file mode 100644 index 000000000000..27cc0bc9d446 --- /dev/null +++ b/src/layer/loongarch/prelu_loongarch.cpp @@ -0,0 +1,193 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
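Ahead of the PReLU kernel below, a minimal scalar sketch of the per-element rule that its LSX loops implement with __lsx_vfcmp_cle_s / __lsx_vbitsel_v (the compare produces a lane mask that selects between v and v * slope). The standalone helper is illustrative only and is not part of the patch:

// Illustrative sketch (not part of the patch): scalar PReLU reference.
// Negative inputs are scaled by slope; non-negative inputs pass through unchanged.
static inline void prelu_scalar_ref(float* ptr, int size, float slope)
{
    for (int i = 0; i < size; i++)
    {
        if (ptr[i] < 0.f)
            ptr[i] *= slope;
    }
}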
+ +#include "prelu_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +PReLU_loongarch::PReLU_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int PReLU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + int w = bottom_top_blob.w * elempack; + +#if __loongarch_sx + int nn_w = w / 4; + int remain_w_start = nn_w * 4; +#else + int remain_w_start = 0; +#endif // __loongarch_sx + + float* ptr = bottom_top_blob; + + if (num_slope > 1) + { + const float* slope = slope_data; + +#if __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < nn_w; i++) + { + float* ptr0 = ptr + i * 4; + + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (__m128)__lsx_vld(slope + i * 4, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr0, 0); + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_w_start; i < w; i++) + { + float v = ptr[i]; + if (v < 0.f) + ptr[i] = v * slope[i]; + } + } + else + { + const float slope = slope_data[0]; + +#if __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < nn_w; i++) + { + float* ptr0 = ptr + i * 4; + + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr0, 0); + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_w_start; i < w; i++) + { + float v = ptr[i]; + if (v < 0.f) + ptr[i] = v * slope; + } + } + } + + if (dims == 2) + { + int w = bottom_top_blob.w * elempack; + int h = bottom_top_blob.h; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + + const float slope = num_slope > 1 ? slope_data[i] : slope_data[0]; + + int j = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (elempack == 4 && num_slope > 1) ? (__m128)__lsx_vld((const float*)slope_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(slope); + + for (; j + 3 < w; j += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + float v = *ptr; + if (v < 0.f) + *ptr = v * slope; + + ptr++; + } + } + } + + if (dims == 3) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h * elempack; + + const float* slope_data_ptr = slope_data; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float slope = num_slope > 1 ? 
slope_data_ptr[q] : slope_data_ptr[0]; + + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (elempack == 4 && num_slope > 1) ? (__m128)__lsx_vld((const float*)slope_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(slope); + + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < 0) + *ptr *= slope; + + ptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/prelu_loongarch.h b/src/layer/loongarch/prelu_loongarch.h new file mode 100644 index 000000000000..97031bb06016 --- /dev/null +++ b/src/layer/loongarch/prelu_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_PRELU_LOONGARCH_H +#define LAYER_PRELU_LOONGARCH_H + +#include "prelu.h" + +namespace ncnn { + +class PReLU_loongarch : virtual public PReLU +{ +public: + PReLU_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_PRELU_LOONGARCH_H diff --git a/src/layer/loongarch/quantize_loongarch.cpp b/src/layer/loongarch/quantize_loongarch.cpp new file mode 100644 index 000000000000..657ff2d06bf5 --- /dev/null +++ b/src/layer/loongarch/quantize_loongarch.cpp @@ -0,0 +1,494 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "quantize_loongarch.h" + +#include + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Quantize_loongarch::Quantize_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Quantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 
8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale); + outptr[1] = float2int8(ptr0[1] * scale); + outptr[2] = float2int8(ptr0[2] * scale); + outptr[3] = float2int8(ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]); + outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]); + outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]); + outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(ptr0 + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); + __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); + _vlow = __lsx_vfmul_s(_vlow, _scale); + _vhigh = __lsx_vfmul_s(_vhigh, _scale); + *((int64_t*)outptr) = float2int8(_vlow, _vhigh); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + __m128 _scale0 = (__m128)__lsx_vld((const float*)scale_data + i * 8, 0); + __m128 _scale1 = (__m128)__lsx_vld((const float*)scale_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(ptr0 + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); + __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); + _vlow = __lsx_vfmul_s(_vlow, _scale0); + _vhigh = __lsx_vfmul_s(_vhigh, _scale1); + *((int64_t*)outptr) = float2int8(_vlow, _vhigh); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = 
float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + const float s0 = scale_data[i * 4]; + const float s1 = scale_data[i * 4 + 1]; + const float s2 = scale_data[i * 4 + 2]; + const float s3 = scale_data[i * 4 + 3]; + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(ptr0 + 32); + __builtin_prefetch(ptr1 + 32); + __m128 _v0 = (__m128)__lsx_vld(ptr0, 0); + __m128 _v1 = (__m128)__lsx_vld(ptr0 + 4, 0); + __m128 _v2 = (__m128)__lsx_vld(ptr1, 0); + __m128 _v3 = (__m128)__lsx_vld(ptr1 + 4, 0); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + _v2 = __lsx_vfmul_s(_v2, _scale); + _v3 = __lsx_vfmul_s(_v3, _scale); + *((int64_t*)outptr) = float2int8(_v0, _v2); + *((int64_t*)(outptr + 8)) = float2int8(_v1, _v3); + + ptr0 += 8; + ptr1 += 8; + outptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(ptr0 + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); + __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); + _vlow = __lsx_vfmul_s(_vlow, _scale); + _vhigh = __lsx_vfmul_s(_vhigh, _scale); + *((int64_t*)outptr) = float2int8(_vlow, _vhigh); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + __m128 _scale0 = (__m128)__lsx_vld((const float*)scale_data + q * 8, 0); + __m128 _scale1 = (__m128)__lsx_vld((const float*)scale_data + q * 8 + 4, 0); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(ptr0 + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); + __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); + _vlow = __lsx_vfmul_s(_vlow, _scale0); + _vhigh = __lsx_vfmul_s(_vhigh, _scale1); + *((int64_t*)outptr) = float2int8(_vlow, _vhigh); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp 
parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + const float s0 = scale_data[q * 4]; + const float s1 = scale_data[q * 4 + 1]; + const float s2 = scale_data[q * 4 + 2]; + const float s3 = scale_data[q * 4 + 3]; + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + return 0; + } +#endif // __loongarch_sx + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const float* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale_data[i]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + for (int j = 0; j < w; j++) + { + *outptr0++ = float2int8(*ptr0++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + + int i = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + for (; i + 15 < size; i += 16) + { + __builtin_prefetch(ptr + 64); + __m128 _v0 = (__m128)__lsx_vld(ptr, 0); + __m128 _v1 = (__m128)__lsx_vld(ptr + 4, 0); + __m128 _v2 = (__m128)__lsx_vld(ptr + 8, 0); + __m128 _v3 = (__m128)__lsx_vld(ptr + 12, 0); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + _v2 = __lsx_vfmul_s(_v2, _scale); + _v3 = __lsx_vfmul_s(_v3, _scale); + *((int64_t*)outptr) = float2int8(_v0, _v1); + *((int64_t*)(outptr + 8)) = float2int8(_v2, _v3); + + ptr += 16; + outptr += 16; + } + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(ptr + 32); + __m128 _v0 = (__m128)__lsx_vld(ptr, 0); + __m128 _v1 = (__m128)__lsx_vld(ptr + 4, 0); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + *((int64_t*)outptr) = float2int8(_v0, _v1); + + ptr += 8; + outptr += 8; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr++ = float2int8(*ptr++ * scale); + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/quantize_loongarch.h b/src/layer/loongarch/quantize_loongarch.h new file mode 100644 index 000000000000..cae04aab171f --- /dev/null +++ b/src/layer/loongarch/quantize_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_QUANTIZE_LOONGARCH_H +#define LAYER_QUANTIZE_LOONGARCH_H + +#include "quantize.h" + +namespace ncnn { + +class Quantize_loongarch : virtual public Quantize +{ +public: + Quantize_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_QUANTIZE_LOONGARCH_H diff --git a/src/layer/loongarch/relu_loongarch.cpp b/src/layer/loongarch/relu_loongarch.cpp new file mode 100644 index 000000000000..eb478d3ae9b1 --- /dev/null +++ b/src/layer/loongarch/relu_loongarch.cpp @@ -0,0 +1,98 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
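As a scalar cross-check for the quantize paths above: a sketch of the rounding and saturation rule assumed here. ncnn's float2int8 helper (presumably provided by loongarch_usability.h) is taken to round to nearest and clamp to [-127, 127]; the standalone version below is illustrative only and not part of the patch:

#include <math.h> // roundf

// Illustrative sketch only: scalar int8 quantization matching the intent of the code above,
// assuming round-to-nearest with saturation to [-127, 127].
static inline signed char quantize_scalar_ref(float v, float scale)
{
    int q = (int)roundf(v * scale);
    if (q > 127) q = 127;
    if (q < -127) q = -127;
    return (signed char)q;
}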
+ +#include "relu_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +ReLU_loongarch::ReLU_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int ReLU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + if (slope == 0.f) + { + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmax_s(_p, _zero); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < 0) + *ptr = 0; + ptr++; + } + } + else + { + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < 0) + *ptr *= slope; + ptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/relu_loongarch.h b/src/layer/loongarch/relu_loongarch.h new file mode 100644 index 000000000000..445c6e8febca --- /dev/null +++ b/src/layer/loongarch/relu_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_RELU_LOONGARCH_H +#define LAYER_RELU_LOONGARCH_H + +#include "relu.h" + +namespace ncnn { + +class ReLU_loongarch : virtual public ReLU +{ +public: + ReLU_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_RELU_LOONGARCH_H diff --git a/src/layer/loongarch/requantize_leakyrelu_pack4.h b/src/layer/loongarch/requantize_leakyrelu_pack4.h new file mode 100644 index 000000000000..d6b499426609 --- /dev/null +++ b/src/layer/loongarch/requantize_leakyrelu_pack4.h @@ -0,0 +1,271 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void requantize_leakyrelu_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = top_blob.c; + int out_elempack = top_blob.elempack; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(leakyrelu(v * scale_in, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out), slope) + + // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr0 + 64); + __builtin_prefetch(intptr1 + 64); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); + __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); + __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); + _v00 = __lsx_vfmul_s(_v00, _scale0); + _v01 = __lsx_vfmul_s(_v01, _scale0); + _v02 = __lsx_vfmul_s(_v02, _scale0); + _v03 = __lsx_vfmul_s(_v03, _scale0); + _v10 = __lsx_vfmul_s(_v10, _scale1); + _v11 = __lsx_vfmul_s(_v11, _scale1); + _v12 = __lsx_vfmul_s(_v12, _scale1); + _v13 = __lsx_vfmul_s(_v13, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); + *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v02, _v12, _slope); + *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v03, _v13, _slope); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr0 + 64); + __builtin_prefetch(intptr1 + 64); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); + __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); + __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); + _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); + _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); + _v02 = __lsx_vfmadd_s(_scale0, _v02, _bias0); + _v03 = __lsx_vfmadd_s(_scale0, _v03, _bias0); + _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); + _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); + _v12 = __lsx_vfmadd_s(_scale1, _v12, _bias1); + _v13 = __lsx_vfmadd_s(_scale1, _v13, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); + *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v02, _v12, _slope); + *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v03, _v13, _slope); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr0 + 32); + __builtin_prefetch(intptr1 + 32); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); + _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); + _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); + _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); + + intptr0 += 8; + intptr1 += 8; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + signed char* vp; + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + + __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __m128i v = float2int8leakyrelu(_v, _slope); + vp = (signed char*)&v; + ptr0[0] = vp[0]; + ptr1[0] = vp[1]; + ptr2[0] = vp[2]; + ptr3[0] = vp[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + signed char* vp; + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + + __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); + _bias = __lsx_vfmul_s(_bias, _scale_out); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __m128i v = float2int8leakyrelu(_v, _slope); + vp = (signed char*)&v; + ptr0[0] = vp[0]; + ptr1[0] = vp[1]; + ptr2[0] = vp[2]; + ptr3[0] = vp[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } +} diff --git a/src/layer/loongarch/requantize_leakyrelu_pack8.h b/src/layer/loongarch/requantize_leakyrelu_pack8.h new file mode 100644 index 000000000000..a2c4faed4f2a --- /dev/null +++ b/src/layer/loongarch/requantize_leakyrelu_pack8.h @@ -0,0 +1,188 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
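The comment pair repeated in these requantize_leakyrelu headers relies on leaky ReLU being positively homogeneous: for scale_out > 0, leakyrelu(y, slope) * scale_out == leakyrelu(y * scale_out, slope), so scale_out can be folded into scale_in and bias before the loop. A scalar sketch of the fused form (names and helper are illustrative, not part of the patch):

#include <math.h> // roundf

// Illustrative sketch: fused requantize + leaky ReLU on one value, assuming scale_out > 0
// and int8 saturation to [-127, 127].
static inline signed char requantize_leakyrelu_ref(int v, float scale_in, float scale_out,
                                                   float bias, float slope)
{
    // int8(leakyrelu(v * scale_in + bias, slope) * scale_out)
    // == int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope)
    float y = (float)v * (scale_in * scale_out) + bias * scale_out;
    float f = y > 0.f ? y : y * slope;
    int q = (int)roundf(f);
    if (q > 127) q = 127;
    if (q < -127) q = -127;
    return (signed char)q;
}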
+ +static void requantize_leakyrelu_pack8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(leakyrelu(v * scale_in, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out), slope) + + // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 128); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); + __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); + __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); + __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + _v4 = __lsx_vfmul_s(_v4, _scale0); + _v5 = __lsx_vfmul_s(_v5, _scale1); + _v6 = __lsx_vfmul_s(_v6, _scale0); + _v7 = __lsx_vfmul_s(_v7, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); + *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v4, _v5, _slope); + *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v6, _v7, _slope); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = 
__lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 128); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); + __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); + __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); + __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + _v4 = __lsx_vfmadd_s(_scale0, _v4, _bias0); + _v5 = __lsx_vfmadd_s(_scale1, _v5, _bias1); + _v6 = __lsx_vfmadd_s(_scale0, _v6, _bias0); + _v7 = __lsx_vfmadd_s(_scale1, _v7, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); + *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v4, _v5, _slope); + *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v6, _v7, _slope); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + 
__builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + + intptr += 8; + ptr += 8; + } + } + } +} diff --git a/src/layer/loongarch/requantize_loongarch.cpp b/src/layer/loongarch/requantize_loongarch.cpp new file mode 100644 index 000000000000..556d20de4f6d --- /dev/null +++ b/src/layer/loongarch/requantize_loongarch.cpp @@ -0,0 +1,1386 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "requantize_loongarch.h" + +#include + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +#if __loongarch_sx +#include "requantize_leakyrelu_pack4.h" +#include "requantize_leakyrelu_pack8.h" +#include "requantize_relu_pack4.h" +#include "requantize_relu_pack8.h" +#endif // __loongarch_sx + +Requantize_loongarch::Requantize_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Requantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 8) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in); + _v1 = __lsx_vfmul_s(_v1, _scale_in); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias); + _v0 = 
activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in); + _v1 = __lsx_vfmul_s(_v1, _scale_in); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (activation_type == 1) + { + requantize_relu_pack8_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + return 0; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + requantize_leakyrelu_pack8_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + return 0; + } + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + 
i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 
_bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr0[0] = v[0]; + ptr1[0] = v[1]; + ptr2[0] = v[2]; + ptr3[0] = v[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr0[0] = v[0]; + ptr1[0] = v[1]; + ptr2[0] = v[2]; + ptr3[0] = v[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (activation_type == 1) + { + requantize_relu_pack4_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + return 0; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + requantize_leakyrelu_pack4_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + return 0; + } + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr0[0] = v[0]; + ptr1[0] = v[1]; + ptr2[0] = v[2]; + ptr3[0] = v[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr0[0] = v[0]; + ptr1[0] = v[1]; + ptr2[0] = v[2]; + ptr3[0] = v[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } + } + + return 0; + } +#endif // __loongarch_sx + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + signed char* ptr = top_blob; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + const float scale_in = scale_in_data[0]; + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + const float scale_in = scale_in_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for 
(int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + const float bias = bias_data_size == 1 ? 
bias_data[0] : bias_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in + bias; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + + for (int i = 0; i < size; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + for (int i = 0; i < size; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/requantize_loongarch.h b/src/layer/loongarch/requantize_loongarch.h new file mode 100644 index 000000000000..8175989959eb --- /dev/null +++ b/src/layer/loongarch/requantize_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_REQUANTIZE_LOONGARCH_H +#define LAYER_REQUANTIZE_LOONGARCH_H + +#include "requantize.h" + +namespace ncnn { + +class Requantize_loongarch : virtual public Requantize +{ +public: + Requantize_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_REQUANTIZE_LOONGARCH_H diff --git a/src/layer/loongarch/requantize_relu_pack4.h b/src/layer/loongarch/requantize_relu_pack4.h new file mode 100644 index 000000000000..2fba8dfc2e48 --- /dev/null +++ b/src/layer/loongarch/requantize_relu_pack4.h @@ -0,0 +1,267 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void requantize_relu_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = top_blob.c; + int out_elempack = top_blob.elempack; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr0 + 64); + __builtin_prefetch(intptr1 + 64); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); + __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); + __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); + _v00 = __lsx_vfmul_s(_v00, _scale0); + _v01 = __lsx_vfmul_s(_v01, _scale0); + _v02 = __lsx_vfmul_s(_v02, _scale0); + _v03 = __lsx_vfmul_s(_v03, _scale0); + _v10 = __lsx_vfmul_s(_v10, _scale1); + _v11 = __lsx_vfmul_s(_v11, _scale1); + _v12 = __lsx_vfmul_s(_v12, _scale1); + _v13 = __lsx_vfmul_s(_v13, _scale1); + *((int64_t*)ptr) = float2int8relu(_v00, _v10); + *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); + *((int64_t*)(ptr + 16)) = float2int8relu(_v02, _v12); + *((int64_t*)(ptr + 24)) = float2int8relu(_v03, _v13); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr0 + 64); + __builtin_prefetch(intptr1 + 64); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); + __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); + __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); + _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); + _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); + _v02 = __lsx_vfmadd_s(_scale0, _v02, _bias0); + _v03 = __lsx_vfmadd_s(_scale0, _v03, _bias0); + _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); + _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); + _v12 = __lsx_vfmadd_s(_scale1, _v12, _bias1); + _v13 = __lsx_vfmadd_s(_scale1, _v13, _bias1); + *((int64_t*)ptr) = float2int8relu(_v00, _v10); + *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); + *((int64_t*)(ptr + 16)) = float2int8relu(_v02, _v12); + *((int64_t*)(ptr + 24)) = float2int8relu(_v03, _v13); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr0 + 32); + __builtin_prefetch(intptr1 + 32); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); + _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); + _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); + _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); + *((int64_t*)ptr) = float2int8relu(_v00, _v10); + *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); + + intptr0 += 8; + intptr1 += 8; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + signed char* vp; + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + + __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __m128i v = float2int8relu(_v); + vp = (signed char*)&v; + ptr0[0] = vp[0]; + ptr1[0] = vp[1]; + ptr2[0] = vp[2]; + ptr3[0] = vp[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + signed char* vp; + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + + __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); + _bias = __lsx_vfmul_s(_bias, _scale_out); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __m128i v = float2int8relu(_v); + vp = (signed char*)&v; + ptr0[0] = vp[0]; + ptr1[0] = vp[1]; + ptr2[0] = vp[2]; + ptr3[0] = vp[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } +} diff --git a/src/layer/loongarch/requantize_relu_pack8.h b/src/layer/loongarch/requantize_relu_pack8.h new file mode 100644 index 000000000000..3d2a45b45d06 --- /dev/null +++ b/src/layer/loongarch/requantize_relu_pack8.h @@ -0,0 +1,186 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
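+
+// requantize with fused ReLU for pack8 (int32 -> int8) blobs: the per-channel
+// scale_in and scale_out vectors (or scalar broadcasts) are folded into a single
+// multiplier, and any bias is pre-multiplied by scale_out, so the unrolled inner
+// loops only need a vfmul / vfmadd before the saturating float2int8relu pack,
+// which clamps negative results to zero.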
+ +static void requantize_relu_pack8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 128); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); + __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); + __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); + __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + _v4 = __lsx_vfmul_s(_v4, _scale0); + _v5 = __lsx_vfmul_s(_v5, _scale1); + _v6 = __lsx_vfmul_s(_v6, _scale0); + _v7 = __lsx_vfmul_s(_v7, _scale1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); + *((int64_t*)(ptr + 16)) = float2int8relu(_v4, _v5); + *((int64_t*)(ptr + 24)) = float2int8relu(_v6, _v7); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = 
float2int8relu(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 128); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); + __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); + __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); + __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + _v4 = __lsx_vfmadd_s(_scale0, _v4, _bias0); + _v5 = __lsx_vfmadd_s(_scale1, _v5, _bias1); + _v6 = __lsx_vfmadd_s(_scale0, _v6, _bias0); + _v7 = __lsx_vfmadd_s(_scale1, _v7, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); + *((int64_t*)(ptr + 16)) = float2int8relu(_v4, _v5); + *((int64_t*)(ptr + 24)) = float2int8relu(_v6, _v7); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } +} diff --git 
a/src/layer/loongarch/sigmoid_loongarch.cpp b/src/layer/loongarch/sigmoid_loongarch.cpp new file mode 100644 index 000000000000..6d112804f269 --- /dev/null +++ b/src/layer/loongarch/sigmoid_loongarch.cpp @@ -0,0 +1,76 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "sigmoid_loongarch.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+#include "loongarch_usability.h"
+
+#include <math.h>
+
+namespace ncnn {
+
+Sigmoid_loongarch::Sigmoid_loongarch()
+{
+#if __loongarch_sx
+ support_packing = true;
+#endif
+}
+
+int Sigmoid_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+ int w = bottom_top_blob.w;
+ int h = bottom_top_blob.h;
+ int d = bottom_top_blob.d;
+ int channels = bottom_top_blob.c;
+ int elempack = bottom_top_blob.elempack;
+ int size = w * h * d * elempack;
+
+ #pragma omp parallel for num_threads(opt.num_threads)
+ for (int q = 0; q < channels; q++)
+ {
+ float* ptr = bottom_top_blob.channel(q);
+
+ int i = 0;
+#if __loongarch_sx
+ __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
+ for (; i + 3 < size; i += 4)
+ {
+ __builtin_prefetch(ptr + 16);
+ __m128 _p = (__m128)__lsx_vld(ptr, 0);
+ _p = (__m128)__lsx_vbitrevi_w((__m128i)_p, 31);
+ _p = exp_ps(_p);
+ _p = __lsx_vfadd_s(_p, _one);
+ __m128 _outp = __lsx_vfdiv_s(_one, _p);
+ __lsx_vst(_outp, ptr, 0);
+
+ ptr += 4;
+ }
+#endif // __loongarch_sx
+ for (; i < size; i++)
+ {
+ *ptr = 1.f / (1.f + exp(-*ptr));
+
+ ptr++;
+ }
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/sigmoid_loongarch.h b/src/layer/loongarch/sigmoid_loongarch.h new file mode 100644 index 000000000000..b15aad235db5 --- /dev/null +++ b/src/layer/loongarch/sigmoid_loongarch.h @@ -0,0 +1,32 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SIGMOID_LOONGARCH_H
+#define LAYER_SIGMOID_LOONGARCH_H
+
+#include "sigmoid.h"
+
+namespace ncnn {
+
+class Sigmoid_loongarch : virtual public Sigmoid
+{
+public:
+ Sigmoid_loongarch();
+
+ virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SIGMOID_LOONGARCH_H
diff --git a/src/layer/loongarch/slice_loongarch.cpp b/src/layer/loongarch/slice_loongarch.cpp new file mode 100644 index 000000000000..edd8656a4bb3 --- /dev/null +++ b/src/layer/loongarch/slice_loongarch.cpp @@ -0,0 +1,371 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "slice_loongarch.h"
+
+namespace ncnn {
+
+Slice_loongarch::Slice_loongarch()
+{
+#if __loongarch_sx
+ support_packing = true;
+#endif // __loongarch_sx
+}
+
+int Slice_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+ const Mat& bottom_blob = bottom_blobs[0];
+ int dims = bottom_blob.dims;
+ size_t elemsize = bottom_blob.elemsize;
+ int elempack = bottom_blob.elempack;
+ const int* slices_ptr = slices;
+ int positive_axis = axis < 0 ? dims + axis : axis;
+
+ if (dims == 1) // positive_axis == 0
+ {
+ // slice vector
+ int w = bottom_blob.w * elempack;
+ int q = 0;
+ for (size_t i = 0; i < top_blobs.size(); i++)
+ {
+ int slice = slices_ptr[i];
+ if (slice == -233)
+ {
+ slice = (w - q) / (top_blobs.size() - i);
+ }
+
+ int out_elempack = 1;
+#if __loongarch_sx
+ if (opt.use_packing_layout)
+ out_elempack = slice % 4 == 0 ? 4 : 1;
+#endif
+ size_t out_elemsize = elemsize / elempack * out_elempack;
+
+ Mat& top_blob = top_blobs[i];
+ top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
+ if (top_blob.empty())
+ return -100;
+
+ const float* ptr = (const float*)bottom_blob + q;
+ float* outptr = top_blob;
+ memcpy(outptr, ptr, top_blob.w * top_blob.elemsize);
+
+ q += slice;
+ }
+ }
+
+ if (dims == 2 && positive_axis == 0)
+ {
+ // slice image height
+ int w = bottom_blob.w;
+ int h = bottom_blob.h * elempack;
+
+ int q = 0;
+ for (size_t i = 0; i < top_blobs.size(); i++)
+ {
+ int slice = slices_ptr[i];
+ if (slice == -233)
+ {
+ slice = (h - q) / (top_blobs.size() - i);
+ }
+
+ int out_elempack = 1;
+#if __loongarch_sx
+ if (opt.use_packing_layout)
+ out_elempack = slice % 4 == 0 ?
4 : 1; +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[i]; + top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + size_t out_elemsize = top_blobs[0].elemsize; + int out_elempack = top_blobs[0].elempack; + for (size_t i = 0; i < top_blobs.size(); i++) + { + out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize); + out_elempack = std::min(out_elempack, top_blobs[i].elempack); + } + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > out_elempack) + { + convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); + } + + const float* ptr = bottom_blob_unpacked; + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + if (out_elempack == 1 && top_blob.elempack == 4) + { + for (int j = 0; j < top_blob.h; j++) + { + const float* r0 = ptr; + const float* r1 = ptr + w; + const float* r2 = ptr + w * 2; + const float* r3 = ptr + w * 3; + + float* outptr0 = top_blob.row(j); + + for (int j = 0; j < w; j++) + { + outptr0[0] = *r0++; + outptr0[1] = *r1++; + outptr0[2] = *r2++; + outptr0[3] = *r3++; + + outptr0 += 4; + } + + ptr += w * 4; + } + } + else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4) + { + int size = w * top_blob.h; + + float* outptr = top_blob; + memcpy(outptr, ptr, size * top_blob.elemsize); + + ptr += size * top_blob.elempack; + } + } + } + + if (dims == 2 && positive_axis == 1) + { + // slice image width + int w = bottom_blob.w; + int h = bottom_blob.h; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (w - q) / (top_blobs.size() - i); + } + + Mat& top_blob = top_blobs[i]; + top_blob.create(slice, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { + const float* ptr = bottom_blob.row(j); + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + float* outptr = top_blob.row(j); + memcpy(outptr, ptr, top_blob.w * elemsize); + + ptr += top_blob.w * elempack; + } + } + } + + if (dims == 3 && positive_axis == 0) + { + // slice dim channel + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c * elempack; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (channels - q) / (top_blobs.size() - i); + } + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + out_elempack = slice % 4 == 0 ? 
4 : 1; +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[i]; + top_blob.create(w, h, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + size_t out_elemsize = top_blobs[0].elemsize; + int out_elempack = top_blobs[0].elempack; + for (size_t i = 0; i < top_blobs.size(); i++) + { + out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize); + out_elempack = std::min(out_elempack, top_blobs[i].elempack); + } + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > out_elempack) + { + convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); + } + + int p = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + if (out_elempack == 1 && top_blob.elempack == 4) + { + int size = top_blob.w * top_blob.h; + + for (int q = 0; q < top_blob.c; q++) + { + const float* r0 = bottom_blob_unpacked.channel(p); + const float* r1 = bottom_blob_unpacked.channel(p + 1); + const float* r2 = bottom_blob_unpacked.channel(p + 2); + const float* r3 = bottom_blob_unpacked.channel(p + 3); + + float* outptr0 = top_blob.channel(q); + + for (int j = 0; j < size; j++) + { + outptr0[0] = *r0++; + outptr0[1] = *r1++; + outptr0[2] = *r2++; + outptr0[3] = *r3++; + + outptr0 += 4; + } + + p += 4; + } + } + else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4) + { + int size = top_blob.total(); + + const float* ptr = bottom_blob_unpacked.channel(p); + float* outptr = top_blob; + memcpy(outptr, ptr, size * top_blob.elemsize); + + p += top_blob.c; + } + } + } + + if (dims == 3 && positive_axis == 1) + { + // slice dim height + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (h - q) / (top_blobs.size() - i); + } + + Mat& top_blob = top_blobs[i]; + top_blob.create(w, slice, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* ptr = bottom_blob.channel(p); + + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + int size = top_blob.w * top_blob.h; + + float* outptr = top_blob.channel(p); + memcpy(outptr, ptr, size * elemsize); + + ptr += size * elempack; + } + } + } + + if (dims == 3 && positive_axis == 2) + { + // slice dim width + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (w - q) / (top_blobs.size() - i); + } + + Mat& top_blob = top_blobs[i]; + top_blob.create(slice, h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* ptr = bottom_blob.channel(p); + + for (int j = 0; j < h; j++) + { + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + float* outptr = top_blob.channel(p).row(j); + memcpy(outptr, ptr, top_blob.w * elemsize); + + ptr += top_blob.w * elempack; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/slice_loongarch.h 
b/src/layer/loongarch/slice_loongarch.h new file mode 100644 index 000000000000..b42138ba4183 --- /dev/null +++ b/src/layer/loongarch/slice_loongarch.h @@ -0,0 +1,32 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SLICE_LOONGARCH_H
+#define LAYER_SLICE_LOONGARCH_H
+
+#include "slice.h"
+
+namespace ncnn {
+
+class Slice_loongarch : virtual public Slice
+{
+public:
+ Slice_loongarch();
+
+ virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SLICE_LOONGARCH_H
diff --git a/src/layer/loongarch/softmax_loongarch.cpp b/src/layer/loongarch/softmax_loongarch.cpp new file mode 100644 index 000000000000..88b49559754b --- /dev/null +++ b/src/layer/loongarch/softmax_loongarch.cpp @@ -0,0 +1,175 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "softmax_loongarch.h"
+
+#include <float.h>
+#include <math.h>
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+namespace ncnn {
+
+int Softmax_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+ int dims = bottom_top_blob.dims;
+ size_t elemsize = bottom_top_blob.elemsize;
+ int positive_axis = axis < 0 ?
dims + axis : axis; + + if (dims != 3 || positive_axis != 0) + return Softmax::forward_inplace(bottom_top_blob, opt); + + // value = exp( value - global max value ) + // sum all value + // value = value / sum + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + + Mat max; + max.create(w, h, elemsize, opt.workspace_allocator); + if (max.empty()) + return -100; + max.fill(-FLT_MAX); + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float* maxptr = max; + + for (int i = 0; i < size; i++) + { + maxptr[i] = std::max(maxptr[i], ptr[i]); + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float* maxptr = max; + +#if __loongarch_sx + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __loongarch_sx + +#if __loongarch_sx + for (; nn > 0; nn--) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _max = (__m128)__lsx_vld(maxptr, 0); + + _p = exp_ps(__lsx_vfsub_s(_p, _max)); + + __lsx_vst(_p, ptr, 0); + + ptr += 4; + maxptr += 4; + } +#endif // __loongarch_sx + + for (; remain > 0; remain--) + { + *ptr = exp(*ptr - *maxptr); + + ptr++; + maxptr++; + } + } + + Mat sum; + sum.create(w, h, elemsize, opt.workspace_allocator); + if (sum.empty()) + return -100; + sum.fill(0.f); + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float* sumptr = sum; + +#if __loongarch_sx + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __loongarch_sx + +#if __loongarch_sx + for (; nn > 0; nn--) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _sum = (__m128)__lsx_vld(sumptr, 0); + _sum = __lsx_vfadd_s(_sum, _p); + __lsx_vst(_sum, sumptr, 0); + + ptr += 4; + sumptr += 4; + } +#endif // __loongarch_sx + + for (; remain > 0; remain--) + { + *sumptr += *ptr; + + ptr++; + sumptr++; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float* sumptr = sum; + +#if __loongarch_sx + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __loongarch_sx + +#if __loongarch_sx + for (; nn > 0; nn--) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _sum = (__m128)__lsx_vld(sumptr, 0); + _p = __lsx_vfdiv_s(_p, _sum); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + sumptr += 4; + } +#endif // __loongarch_sx + + for (; remain > 0; remain--) + { + *ptr /= *sumptr; + + ptr++; + sumptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/softmax_loongarch.h b/src/layer/loongarch/softmax_loongarch.h new file mode 100644 index 000000000000..3c8272a6412f --- /dev/null +++ b/src/layer/loongarch/softmax_loongarch.h @@ -0,0 +1,30 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SOFTMAX_LOONGARCH_H
+#define LAYER_SOFTMAX_LOONGARCH_H
+
+#include "softmax.h"
+
+namespace ncnn {
+
+class Softmax_loongarch : virtual public Softmax
+{
+public:
+ virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SOFTMAX_LOONGARCH_H
diff --git a/src/layer/loongarch/swish_loongarch.cpp b/src/layer/loongarch/swish_loongarch.cpp new file mode 100644 index 000000000000..9c9005de6fcc --- /dev/null +++ b/src/layer/loongarch/swish_loongarch.cpp @@ -0,0 +1,70 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "swish_loongarch.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+#include <math.h>
+
+namespace ncnn {
+
+Swish_loongarch::Swish_loongarch()
+{
+#if __loongarch_sx
+ support_packing = true;
+#endif // __loongarch_sx
+}
+
+int Swish_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+ int w = bottom_top_blob.w;
+ int h = bottom_top_blob.h;
+ int d = bottom_top_blob.d;
+ int channels = bottom_top_blob.c;
+ int elempack = bottom_top_blob.elempack;
+ int size = w * h * d * elempack;
+
+ #pragma omp parallel for num_threads(opt.num_threads)
+ for (int q = 0; q < channels; q++)
+ {
+ float* ptr = bottom_top_blob.channel(q);
+
+ int i = 0;
+#if __loongarch_sx
+ __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
+ for (; i + 3 < size; i += 4)
+ {
+ __builtin_prefetch(ptr + 16);
+ __m128i _p = __lsx_vld(ptr, 0);
+ _p = (__m128i)__lsx_vfdiv_s((__m128)_p, __lsx_vfadd_s(_one, exp_ps((__m128)__lsx_vbitrevi_w(_p, 31))));
+ __lsx_vst(_p, ptr, 0);
+
+ ptr += 4;
+ }
+#endif // __loongarch_sx
+ for (; i < size; i++)
+ {
+ *ptr = *ptr / (1.f + exp(-*ptr));
+ ptr++;
+ }
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/swish_loongarch.h b/src/layer/loongarch/swish_loongarch.h new file mode 100644 index 000000000000..b8d0b80f01e4 --- /dev/null +++ b/src/layer/loongarch/swish_loongarch.h @@ -0,0 +1,32 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SWISH_LOONGARCH_H
+#define LAYER_SWISH_LOONGARCH_H
+
+#include "swish.h"
+
+namespace ncnn {
+
+class Swish_loongarch : virtual public Swish
+{
+public:
+ Swish_loongarch();
+
+ virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SWISH_LOONGARCH_H
diff --git a/src/layer/loongarch/tanh_loongarch.cpp b/src/layer/loongarch/tanh_loongarch.cpp new file mode 100644 index 000000000000..13227fa71e34 --- /dev/null +++ b/src/layer/loongarch/tanh_loongarch.cpp @@ -0,0 +1,69 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "tanh_loongarch.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+#include <math.h>
+
+namespace ncnn {
+
+TanH_loongarch::TanH_loongarch()
+{
+#if __loongarch_sx
+ support_packing = true;
+#endif
+}
+
+int TanH_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+ int w = bottom_top_blob.w;
+ int h = bottom_top_blob.h;
+ int d = bottom_top_blob.d;
+ int channels = bottom_top_blob.c;
+ int elempack = bottom_top_blob.elempack;
+ int size = w * h * d * elempack;
+
+ #pragma omp parallel for num_threads(opt.num_threads)
+ for (int q = 0; q < channels; q++)
+ {
+ float* ptr = bottom_top_blob.channel(q);
+
+ int i = 0;
+#if __loongarch_sx
+ for (; i + 3 < size; i += 4)
+ {
+ __builtin_prefetch(ptr + 16);
+ __m128 _p = (__m128)__lsx_vld(ptr, 0);
+ _p = tanh_ps(_p);
+ __lsx_vst(_p, ptr, 0);
+
+ ptr += 4;
+ }
+#endif // __loongarch_sx
+ for (; i < size; i++)
+ {
+ *ptr = tanh(*ptr);
+ ptr++;
+ }
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/tanh_loongarch.h b/src/layer/loongarch/tanh_loongarch.h new file mode 100644 index 000000000000..ecbab01ec8fe --- /dev/null +++ b/src/layer/loongarch/tanh_loongarch.h @@ -0,0 +1,32 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_TANH_LOONGARCH_H
+#define LAYER_TANH_LOONGARCH_H
+
+#include "tanh.h"
+
+namespace ncnn {
+
+class TanH_loongarch : virtual public TanH
+{
+public:
+ TanH_loongarch();
+
+ virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_TANH_LOONGARCH_H
diff --git a/src/layer/loongarch/unaryop_loongarch.cpp b/src/layer/loongarch/unaryop_loongarch.cpp new file mode 100644 index 000000000000..892c4dc42608 --- /dev/null +++ b/src/layer/loongarch/unaryop_loongarch.cpp @@ -0,0 +1,427 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "unaryop_loongarch.h"
+
+#include <math.h>
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+namespace ncnn {
+
+UnaryOp_loongarch::UnaryOp_loongarch()
+{
+#if __loongarch_sx
+ support_packing = true;
+#endif // __loongarch_sx
+}
+
+template<typename Op>
+static int unary_op_inplace(Mat& a, const Option& opt)
+{
+ Op op;
+
+ int w = a.w;
+ int h = a.h;
+ int d = a.d;
+ int channels = a.c;
+ int elempack = a.elempack;
+ int size = w * h * d * elempack;
+
+ #pragma omp parallel for num_threads(opt.num_threads)
+ for (int q = 0; q < channels; q++)
+ {
+ float* ptr = a.channel(q);
+
+ int i = 0;
+#if __loongarch_sx
+ for (; i + 3 < size; i += 4)
+ {
+ __builtin_prefetch(ptr + 16);
+ __m128 _p = (__m128)__lsx_vld(ptr, 0);
+ _p = op.func_pack4(_p);
+ __lsx_vst(_p, ptr, 0);
+ ptr += 4;
+ }
+#endif // __loongarch_sx
+ for (; i < size; i++)
+ {
+ *ptr = op.func(*ptr);
+ ptr++;
+ }
+ }
+
+ return 0;
+}
+
+namespace UnaryOp_loongarch_functor {
+
+struct unary_op_abs
+{
+ float func(const float& x) const
+ {
+ return (float)fabs(x);
+ }
+#if __loongarch_sx
+ __m128 func_pack4(const __m128& x) const
+ {
+ return (__m128)__lsx_vbitclri_w((__m128i)x, 31);
+ }
+#endif // __loongarch_sx
+};
+
+struct unary_op_neg
+{
+ float func(const float& x) const
+ {
+ return -x;
+ }
+#if __loongarch_sx
+ __m128 func_pack4(const __m128& x) const
+ {
+ return (__m128)__lsx_vbitrevi_w((__m128i)x, 31);
+ }
+#endif // __loongarch_sx
+};
+
+struct unary_op_floor
+{
+ float func(const float& x) const
+ {
+ return (float)floor(x);
+ }
+#if __loongarch_sx
+ __m128 func_pack4(const __m128& x) const
+ {
+ // TODO msa optimize
+ float tmp[4];
+ __lsx_vst(x, tmp, 0);
+ tmp[0] = floor(tmp[0]);
+ tmp[1] = floor(tmp[1]);
+ tmp[2] = floor(tmp[2]);
+ tmp[3] = floor(tmp[3]);
+ return (__m128)__lsx_vld(tmp, 0);
+ }
+#endif // __loongarch_sx
+};
+
+struct unary_op_ceil
+{
+ float func(const float& x) const
+ {
+ return (float)ceil(x);
+ }
+#if __loongarch_sx
+ __m128 func_pack4(const __m128& x) const
+ {
+ // TODO msa optimize
+ float tmp[4];
+ __lsx_vst(x, tmp, 0);
+ tmp[0] = ceil(tmp[0]);
+ tmp[1] = ceil(tmp[1]);
+ tmp[2] = ceil(tmp[2]);
+ tmp[3] = ceil(tmp[3]);
+ return (__m128)__lsx_vld(tmp, 0);
+ }
+#endif // __loongarch_sx
+};
+
+struct unary_op_square
+{
+ float func(const
float& x) const + { + return x * x; + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return __lsx_vfmul_s(x, x); + } +#endif // __loongarch_sx +}; + +struct unary_op_sqrt +{ + float func(const float& x) const + { + return (float)sqrt(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return __lsx_vfsqrt_s(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_rsqrt +{ + float func(const float& x) const + { + return (float)(1.f / sqrt(x)); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return __lsx_vfrsqrt_s(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_exp +{ + float func(const float& x) const + { + return (float)exp(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return exp_ps(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_log +{ + float func(const float& x) const + { + return (float)log(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return log_ps(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_sin +{ + float func(const float& x) const + { + return (float)sin(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = sin(tmp[0]); + tmp[1] = sin(tmp[1]); + tmp[2] = sin(tmp[2]); + tmp[3] = sin(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_cos +{ + float func(const float& x) const + { + return (float)cos(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = cos(tmp[0]); + tmp[1] = cos(tmp[1]); + tmp[2] = cos(tmp[2]); + tmp[3] = cos(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_tan +{ + float func(const float& x) const + { + return (float)tan(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = tan(tmp[0]); + tmp[1] = tan(tmp[1]); + tmp[2] = tan(tmp[2]); + tmp[3] = tan(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_asin +{ + float func(const float& x) const + { + return (float)asin(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = asin(tmp[0]); + tmp[1] = asin(tmp[1]); + tmp[2] = asin(tmp[2]); + tmp[3] = asin(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_acos +{ + float func(const float& x) const + { + return (float)acos(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = acos(tmp[0]); + tmp[1] = acos(tmp[1]); + tmp[2] = acos(tmp[2]); + tmp[3] = acos(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_atan +{ + float func(const float& x) const + { + return (float)atan(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = atan(tmp[0]); + tmp[1] = atan(tmp[1]); + tmp[2] = atan(tmp[2]); + tmp[3] = atan(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_reciprocal +{ + float func(const float& x) const + { + return 1.f / x; + } +#if __loongarch_sx + __m128 func_pack4(const 
__m128& x) const
+ {
+ return __lsx_vfrecip_s(x);
+ }
+#endif // __loongarch_sx
+};
+
+struct unary_op_tanh
+{
+ float func(const float& x) const
+ {
+ return (float)tanh(x);
+ }
+#if __loongarch_sx
+ __m128 func_pack4(const __m128& x) const
+ {
+ return tanh_ps(x);
+ }
+#endif // __loongarch_sx
+};
+
+} // namespace UnaryOp_loongarch_functor
+
+int UnaryOp_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+ using namespace UnaryOp_loongarch_functor;
+
+ if (op_type == Operation_ABS)
+ return unary_op_inplace<unary_op_abs>(bottom_top_blob, opt);
+
+ if (op_type == Operation_NEG)
+ return unary_op_inplace<unary_op_neg>(bottom_top_blob, opt);
+
+ if (op_type == Operation_FLOOR)
+ return unary_op_inplace<unary_op_floor>(bottom_top_blob, opt);
+
+ if (op_type == Operation_CEIL)
+ return unary_op_inplace<unary_op_ceil>(bottom_top_blob, opt);
+
+ if (op_type == Operation_SQUARE)
+ return unary_op_inplace<unary_op_square>(bottom_top_blob, opt);
+
+ if (op_type == Operation_SQRT)
+ return unary_op_inplace<unary_op_sqrt>(bottom_top_blob, opt);
+
+ if (op_type == Operation_RSQRT)
+ return unary_op_inplace<unary_op_rsqrt>(bottom_top_blob, opt);
+
+ if (op_type == Operation_EXP)
+ return unary_op_inplace<unary_op_exp>(bottom_top_blob, opt);
+
+ if (op_type == Operation_LOG)
+ return unary_op_inplace<unary_op_log>(bottom_top_blob, opt);
+
+ if (op_type == Operation_SIN)
+ return unary_op_inplace<unary_op_sin>(bottom_top_blob, opt);
+
+ if (op_type == Operation_COS)
+ return unary_op_inplace<unary_op_cos>(bottom_top_blob, opt);
+
+ if (op_type == Operation_TAN)
+ return unary_op_inplace<unary_op_tan>(bottom_top_blob, opt);
+
+ if (op_type == Operation_ASIN)
+ return unary_op_inplace<unary_op_asin>(bottom_top_blob, opt);
+
+ if (op_type == Operation_ACOS)
+ return unary_op_inplace<unary_op_acos>(bottom_top_blob, opt);
+
+ if (op_type == Operation_ATAN)
+ return unary_op_inplace<unary_op_atan>(bottom_top_blob, opt);
+
+ if (op_type == Operation_RECIPROCAL)
+ return unary_op_inplace<unary_op_reciprocal>(bottom_top_blob, opt);
+
+ if (op_type == Operation_TANH)
+ return unary_op_inplace<unary_op_tanh>(bottom_top_blob, opt);
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/unaryop_loongarch.h b/src/layer/loongarch/unaryop_loongarch.h new file mode 100644 index 000000000000..8170bec50cf8 --- /dev/null +++ b/src/layer/loongarch/unaryop_loongarch.h @@ -0,0 +1,32 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_UNARYOP_LOONGARCH_H
+#define LAYER_UNARYOP_LOONGARCH_H
+
+#include "unaryop.h"
+
+namespace ncnn {
+
+class UnaryOp_loongarch : virtual public UnaryOp
+{
+public:
+ UnaryOp_loongarch();
+
+ virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_UNARYOP_LOONGARCH_H
diff --git a/src/layer_registry.h.in b/src/layer_registry.h.in index 99c1d8336f41..6947ecce5d18 100644 --- a/src/layer_registry.h.in +++ b/src/layer_registry.h.in @@ -28,6 +28,12 @@ static const layer_registry_entry layer_registry_msa[] = { }; #endif // NCNN_RUNTIME_CPU && NCNN_MSA
+#if NCNN_RUNTIME_CPU && NCNN_LSX
+static const layer_registry_entry layer_registry_lsx[] = {
+@layer_registry_lsx@
+};
+#endif // NCNN_RUNTIME_CPU && NCNN_LSX
+
 #if NCNN_RUNTIME_CPU && NCNN_RVV static const layer_registry_entry layer_registry_rvv[] = { @layer_registry_rvv@
diff --git a/src/mat.h b/src/mat.h index e534def504fc..c6f59ef42684 100644 --- a/src/mat.h +++ b/src/mat.h @@ -29,6 +29,9 @@ #if __mips_msa #include <msa.h> #endif
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif
 #if __riscv_vector #include <riscv_vector.h> #include "cpu.h" // cpu_riscv_vlenb() @@ -128,6 +131,9 @@ class NCNN_EXPORT Mat #if __mips_msa void fill(v4f32 _v); #endif // __mips_msa
+#if __loongarch_sx
+ void fill(__m128 _v);
+#endif //__loongarch_sx
 #if __riscv_vector void fill(vfloat32m1_t _v); void fill(vuint16m1_t _v); @@ -1067,6 +1073,18 @@ NCNN_FORCEINLINE void Mat::fill(v4f32 _v) } #endif // __mips_msa
+#if __loongarch_sx
+NCNN_FORCEINLINE void Mat::fill(__m128 _v)
+{
+ int size = (int)total();
+ float* ptr = (float*)data;
+ for (int i = 0; i < size; i++)
+ {
+ __lsx_vst(_v, ptr, 0);
+ ptr += 4;
+ }
+}
+#endif // __loongarch_sx
 #if __riscv_vector NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v) {
diff --git a/src/platform.h.in b/src/platform.h.in index 755f8294bc29..219cff4aada9 100644 --- a/src/platform.h.in +++ b/src/platform.h.in @@ -55,6 +55,7 @@ #cmakedefine01 NCNN_ARM86SVEF32MM #endif // __aarch64__ #cmakedefine01 NCNN_MSA
+#cmakedefine01 NCNN_LSX
 #cmakedefine01 NCNN_MMI #cmakedefine01 NCNN_RVV #cmakedefine01 NCNN_INT8
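
The requantize kernel at the top of this hunk folds the per-channel math (x * scale_in + bias) * scale_out into one multiply-add per lane by precomputing _scale = scale_in * scale_out and _bias = bias * scale_out. A scalar sketch of that algebra, for illustration only; the rounding/saturation helper below is a stand-in for the patch's float2int8relu, not its actual implementation:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Scalar reference of the folded requantize step:
// (x * scale_in + bias) * scale_out == x * (scale_in * scale_out) + bias * scale_out
static int8_t requantize_relu_ref(int x, float scale_in, float bias, float scale_out)
{
    const float scale = scale_in * scale_out; // mirrors _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0)
    const float b = bias * scale_out;         // mirrors _bias0 = __lsx_vfmul_s(_bias0, _scale_out0)
    float v = (float)x * scale + b;           // mirrors __lsx_vfmadd_s(_scale0, _v0, _bias0)
    v = std::max(v, 0.f);                     // the "relu" part of the int8 pack
    int q = (int)std::nearbyint(v);           // round to nearest (stand-in for float2int8relu)
    return (int8_t)std::min(q, 127);
}

int main()
{
    // (1000 * 0.01 + 0.5) * 12 == 1000 * 0.12 + 6 == 126
    printf("%d\n", (int)requantize_relu_ref(1000, 0.01f, 0.5f, 12.0f));
    return 0;
}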
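
The activation kernels above (Sigmoid, Swish, and the UnaryOp abs/neg functors) avoid float arithmetic for negation and absolute value by manipulating the IEEE-754 sign bit with __lsx_vbitrevi_w / __lsx_vbitclri_w on lane 31. A minimal portable sketch of the same idea in plain C++, purely illustrative and not part of the patch:

#include <cstdint>
#include <cstring>
#include <cstdio>

// Toggle bit 31: the scalar analogue of __lsx_vbitrevi_w(x, 31),
// which negates each finite float lane.
static float flip_sign(float x)
{
    uint32_t u;
    std::memcpy(&u, &x, sizeof(u));
    u ^= 0x80000000u;
    std::memcpy(&x, &u, sizeof(x));
    return x;
}

// Clear bit 31: the scalar analogue of __lsx_vbitclri_w(x, 31), i.e. fabsf.
static float clear_sign(float x)
{
    uint32_t u;
    std::memcpy(&u, &x, sizeof(u));
    u &= 0x7fffffffu;
    std::memcpy(&x, &u, sizeof(x));
    return x;
}

int main()
{
    printf("%f %f\n", flip_sign(3.5f), clear_sign(-3.5f)); // -3.500000 3.500000
    return 0;
}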
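
Softmax_loongarch above only takes the LSX path for dims == 3 with positive_axis == 0 and works plane-wise over the channel dimension: a per-position max, exp(x - max) written back in place, a per-position sum, then a divide. A compact scalar sketch of the same algorithm on plain arrays (the exp and sum passes are merged here for brevity; illustrative only):

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <vector>

// Scalar reference of channel-axis softmax: value = exp(value - max) / sum.
static void softmax_axis0_ref(std::vector<float>& blob, int channels, int size)
{
    std::vector<float> maxv(size, -FLT_MAX);
    for (int q = 0; q < channels; q++)
        for (int i = 0; i < size; i++)
            maxv[i] = std::max(maxv[i], blob[q * size + i]);

    std::vector<float> sum(size, 0.f);
    for (int q = 0; q < channels; q++)
        for (int i = 0; i < size; i++)
        {
            float v = std::exp(blob[q * size + i] - maxv[i]);
            blob[q * size + i] = v;
            sum[i] += v;
        }

    for (int q = 0; q < channels; q++)
        for (int i = 0; i < size; i++)
            blob[q * size + i] /= sum[i];
}

int main()
{
    std::vector<float> blob = {1.f, 2.f, 3.f, 4.f}; // 2 channels x 2 positions
    softmax_axis0_ref(blob, 2, 2);                  // each position now sums to 1
    return 0;
}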