diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ece7ada7392..3349ea506ebe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -290,6 +290,19 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips)") else() message(WARNING "The compiler does not support loongson mmi extension. NCNN_MMI will be OFF.") endif() +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64|loongarch32)") + set(NCNN_TARGET_ARCH loongarch) + + include(CheckCXXCompilerFlag) + + check_cxx_compiler_flag("-mlsx" NCNN_COMPILER_SUPPORT_LOONGARCH_LSX) + + if(NCNN_COMPILER_SUPPORT_LOONGARCH_LSX) + option(NCNN_LSX "optimize loongarch platform with lsx extension" ON) + else() + message(WARNING "The compiler does not support lsx extension. NCNN_LSX will be OFF.") + endif() + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)") set(NCNN_TARGET_ARCH riscv) @@ -332,8 +345,6 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)") endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)") set(NCNN_TARGET_ARCH powerpc) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch)") - set(NCNN_TARGET_ARCH mips) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(xtensa)") set(NCNN_TARGET_ARCH xtensa) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x)") diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake index 8abb13331a9b..857d3b528bac 100644 --- a/cmake/ncnn_add_layer.cmake +++ b/cmake/ncnn_add_layer.cmake @@ -270,6 +270,12 @@ macro(ncnn_add_layer class) endif() endif() + if(NCNN_RUNTIME_CPU AND NCNN_TARGET_ARCH STREQUAL "loongarch") + if(NCNN_LSX) + ncnn_add_arch_opt_layer(${class} lsx "-mlsx") + endif() + endif() + if(NCNN_RUNTIME_CPU AND NCNN_RVV AND NCNN_TARGET_ARCH STREQUAL "riscv") if(NCNN_COMPILER_SUPPORT_RVV_ZFH) ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh") diff --git a/cmake/ncnn_generate_lsx_source.cmake b/cmake/ncnn_generate_lsx_source.cmake new file mode 100644 index 000000000000..4f8fb20299aa --- /dev/null +++ b/cmake/ncnn_generate_lsx_source.cmake @@ -0,0 +1,14 @@ + +# must define SRC DST CLASS + +file(READ ${SRC} source_data) + +# replace +string(TOUPPER ${CLASS} CLASS_UPPER) +string(TOLOWER ${CLASS} CLASS_LOWER) + +string(REGEX REPLACE "LAYER_${CLASS_UPPER}_LOONGARCH_H" "LAYER_${CLASS_UPPER}_LOONGARCH_LSX_H" source_data "${source_data}") +string(REGEX REPLACE "${CLASS}_loongarch" "${CLASS}_loongarch_lsx" source_data "${source_data}") +string(REGEX REPLACE "#include \"${CLASS_LOWER}_loongarch.h\"" "#include \"${CLASS_LOWER}_loongarch_lsx.h\"" source_data "${source_data}") + +file(WRITE ${DST} "${source_data}") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 11b8573462a8..bb9786749590 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -457,6 +457,12 @@ if(NCNN_TARGET_ARCH STREQUAL "mips") endif() endif() +if(NCNN_TARGET_ARCH STREQUAL "loongarch") + if(NOT NCNN_RUNTIME_CPU AND NCNN_LSX) + target_compile_options(ncnn PRIVATE -mlsx) + endif() +endif() + if(NCNN_TARGET_ARCH STREQUAL "riscv" AND NOT C906) if(NOT NCNN_RUNTIME_CPU AND NCNN_RVV) if(NCNN_COMPILER_SUPPORT_RVV_ZFH) diff --git a/src/cpu.cpp b/src/cpu.cpp index 197093d6dd21..ca90860ff012 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -159,7 +159,7 @@ static unsigned int get_elf_hwcap_from_proc_self_auxv(unsigned int type) return 0; } -#if __aarch64__ || __mips64 || __riscv_xlen == 64 +#if __aarch64__ || __mips64 || __riscv_xlen == 64 || __loongarch64 struct { uint64_t tag; @@ -236,6 +236,12 @@ static unsigned int g_hwcaps2 = get_elf_hwcap(AT_HWCAP2); #define HWCAP_LOONGSON_MMI (1 << 11) #endif +#if __loongarch64 +// from 
arch/loongarch/include/uapi/asm/hwcap.h +#define HWCAP_LOONGARCH_LSX (1 << 4) +#define HWCAP_LOONGARCH_LASX (1 << 5) +#endif + #if __riscv // from arch/riscv/include/uapi/asm/hwcap.h #define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A')) @@ -1001,6 +1007,32 @@ int cpu_support_mips_msa() #endif } +int cpu_support_loongarch_lsx() +{ +#if defined __ANDROID__ || defined __linux__ +#if __loongarch64 + return g_hwcaps & HWCAP_LOONGARCH_LSX; +#else + return 0; +#endif +#else + return 0; +#endif +} + +int cpu_support_loongarch_lasx() +{ +#if defined __ANDROID__ || defined __linux__ +#if __loongarch64 + return g_hwcaps & HWCAP_LOONGARCH_LASX; +#else + return 0; +#endif +#else + return 0; +#endif +} + int cpu_support_loongson_mmi() { #if defined __ANDROID__ || defined __linux__ diff --git a/src/cpu.h b/src/cpu.h index 5a94106ef478..54bacc0c25af 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -93,6 +93,11 @@ NCNN_EXPORT int cpu_support_x86_avx512_bf16(); // avx512_fp16 = x86 avx512 fp16 NCNN_EXPORT int cpu_support_x86_avx512_fp16(); +// lsx = loongarch lsx +NCNN_EXPORT int cpu_support_loongarch_lsx(); +// lasx = loongarch lasx +NCNN_EXPORT int cpu_support_loongarch_lasx(); + // msa = mips mas NCNN_EXPORT int cpu_support_mips_msa(); // mmi = loongson mmi diff --git a/src/layer.cpp b/src/layer.cpp index 518b666ec23f..953aebcd2bd7 100644 --- a/src/layer.cpp +++ b/src/layer.cpp @@ -253,6 +253,13 @@ Layer* create_layer(int index) } else #endif // NCNN_RUNTIME_CPU && NCNN_AVX +#if NCNN_RUNTIME_CPU && NCNN_LSX + if (ncnn::cpu_support_loongarch_lsx()) + { + layer_creator = layer_registry_lsx[index].creator; + } + else +#endif // NCNN_RUNTIME_CPU && NCNN_LSX #if NCNN_RUNTIME_CPU && NCNN_MSA if (ncnn::cpu_support_mips_msa()) { diff --git a/src/layer/loongarch/absval_loongarch.cpp b/src/layer/loongarch/absval_loongarch.cpp new file mode 100644 index 000000000000..ea60b01eaf02 --- /dev/null +++ b/src/layer/loongarch/absval_loongarch.cpp @@ -0,0 +1,67 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
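+// Implementation note: the LSX path below computes |x| by clearing the IEEE-754 sign
+// bit (bit 31) of each lane with __lsx_vbitclri_w, handling 4 floats per
+// __lsx_vld/__lsx_vst pair; the trailing scalar loop covers the leftover elements.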
+ +#include "absval_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +namespace ncnn { + +AbsVal_loongarch::AbsVal_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int AbsVal_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128i _p = __lsx_vld(ptr, 0); + __m128i _outp = __lsx_vbitclri_w(_p, 31); + __lsx_vst(_outp, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = *ptr > 0 ? *ptr : -*ptr; + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/absval_loongarch.h b/src/layer/loongarch/absval_loongarch.h new file mode 100644 index 000000000000..0a3143cea432 --- /dev/null +++ b/src/layer/loongarch/absval_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_ABSVAL_LOONGARCH_H +#define LAYER_ABSVAL_LOONGARCH_H + +#include "absval.h" + +namespace ncnn { + +class AbsVal_loongarch : virtual public AbsVal +{ +public: + AbsVal_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_ABSVAL_LOONGARCH_H diff --git a/src/layer/loongarch/batchnorm_loongarch.cpp b/src/layer/loongarch/batchnorm_loongarch.cpp new file mode 100644 index 000000000000..f0e33b78efdc --- /dev/null +++ b/src/layer/loongarch/batchnorm_loongarch.cpp @@ -0,0 +1,145 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
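+// Implementation note: batchnorm is applied as y = b * x + a per element, with
+// b = slope / sqrt(var + eps) and a = bias - slope * mean / sqrt(var + eps)
+// precomputed by the base BatchNorm class. The LSX path fuses the multiply-add into
+// __lsx_vfmadd_s on pack-4 data and falls back to a scalar loop for remaining elements.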
+ +#include "batchnorm_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +BatchNorm_loongarch::BatchNorm_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int BatchNorm_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + int w = bottom_top_blob.w * elempack; + +#if __loongarch_sx + int nn_w = w / 4; + int remain_w_start = nn_w * 4; +#else + int remain_w_start = 0; +#endif // __loongarch_sx + + float* ptr = bottom_top_blob; + +#if __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < nn_w; i++) + { + float* ptr0 = ptr + i * 4; + + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __m128 _a = (__m128)__lsx_vld((const float*)a_data + i * 4, 0); + __m128 _b = (__m128)__lsx_vld((const float*)b_data + i * 4, 0); + _p = __lsx_vfmadd_s(_b, _p, _a); + __lsx_vst(_p, ptr0, 0); + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_w_start; i < w; i++) + { + ptr[i] = b_data[i] * ptr[i] + a_data[i]; + } + } + + if (dims == 2) + { + int w = bottom_top_blob.w * elempack; + int h = bottom_top_blob.h; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + float a = a_data[i]; + float b = b_data[i]; + + int j = 0; +#if __loongarch_sx + __m128 _a = elempack == 4 ? (__m128)__lsx_vld((const float*)a_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(a); + __m128 _b = elempack == 4 ? (__m128)__lsx_vld((const float*)b_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(b); + for (; j + 3 < w; j += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmadd_s(_b, _p, _a); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + *ptr = b * *ptr + a; + ptr++; + } + } + } + + if (dims == 3 || dims == 4) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + float* ptr = bottom_top_blob.channel(q); + float a = a_data[q]; + float b = b_data[q]; + + int i = 0; +#if __loongarch_sx + __m128 _a = elempack == 4 ? (__m128)__lsx_vld((const float*)a_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(a); + __m128 _b = elempack == 4 ? (__m128)__lsx_vld((const float*)b_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(b); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmadd_s(_b, _p, _a); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = b * *ptr + a; + ptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/batchnorm_loongarch.h b/src/layer/loongarch/batchnorm_loongarch.h new file mode 100644 index 000000000000..8b38d5e1f666 --- /dev/null +++ b/src/layer/loongarch/batchnorm_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_BATCHNORM_LOONGARCH_H +#define LAYER_BATCHNORM_LOONGARCH_H + +#include "batchnorm.h" + +namespace ncnn { + +class BatchNorm_loongarch : virtual public BatchNorm +{ +public: + BatchNorm_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_BATCHNORM_LOONGARCH_H diff --git a/src/layer/loongarch/bias_loongarch.cpp b/src/layer/loongarch/bias_loongarch.cpp new file mode 100644 index 000000000000..74129a8d3284 --- /dev/null +++ b/src/layer/loongarch/bias_loongarch.cpp @@ -0,0 +1,70 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "bias_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +int Bias_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int size = w * h * d; + + const float* bias_ptr = bias_data; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + float bias = bias_ptr[q]; + +#if __loongarch_sx + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __loongarch_sx + +#if __loongarch_sx + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias); + for (; nn > 0; nn--) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = __lsx_vfadd_s(_p, _bias); + __lsx_vst(_outp, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + + for (; remain > 0; remain--) + { + *ptr = *ptr + bias; + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/bias_loongarch.h b/src/layer/loongarch/bias_loongarch.h new file mode 100644 index 000000000000..f122ffa0dd92 --- /dev/null +++ b/src/layer/loongarch/bias_loongarch.h @@ -0,0 +1,30 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_BIAS_LOONGARCH_H +#define LAYER_BIAS_LOONGARCH_H + +#include "bias.h" + +namespace ncnn { + +class Bias_loongarch : virtual public Bias +{ +public: + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_BIAS_LOONGARCH_H diff --git a/src/layer/loongarch/binaryop_loongarch.cpp b/src/layer/loongarch/binaryop_loongarch.cpp new file mode 100644 index 000000000000..7832c9ca732b --- /dev/null +++ b/src/layer/loongarch/binaryop_loongarch.cpp @@ -0,0 +1,1066 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "binaryop_loongarch.h" + +#include + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +namespace ncnn { + +BinaryOp_loongarch::BinaryOp_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +template +static int binary_op_2_3_4_20(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = b.w; + int h = b.h; + int d = b.d; + int channels = b.c; + int elempack = b.elempack; + int size = w * h * d * elempack; + + // type 2 3 4 20 + c.create_like(b, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float a0 = a[0]; + const float* ptr = b.channel(q); + float* outptr = c.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _a0 = __lsx_vreplfr2vr_s(a0); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_a0, _p); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = op(a0, *ptr); + ptr += 1; + outptr += 1; + } + } + + return 0; +} + +template +static int binary_op_6_11_16_25(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + // type 6 11 16 25 + c.create_like(a, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float b0 = b[0]; + float* outptr = c.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _b0 = __lsx_vreplfr2vr_s(b0); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = 
(__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = op(*ptr, b0); + ptr += 1; + outptr += 1; + } + } + + return 0; +} + +template +static int binary_op_7_13_19_29(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + // type 7 13 19 29 + c.create_like(a, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = op(*ptr, *ptr1); + ptr += 1; + ptr1 += 1; + outptr += 1; + } + } + + return 0; +} + +#if __loongarch_sx +// broadcasting rule +// https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting + +template +static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int size = w * h * d; + size_t elemsize = a.elemsize; + int elempack = a.elempack; + + int w1 = b.w; + int h1 = b.h; + int d1 = b.d; + int channels1 = b.c; + int size1 = w1 * h1 * d1; + size_t elemsize1 = b.elemsize; + int elempack1 = b.elempack; + + if (a.dims == 4) + { + if (b.dims == 4) + { + // type 29 + return binary_op_7_13_19_29(a, b, c, opt); + } + + c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 3) + { + // type 28 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d; z++) + { + for (int y = 0; y < h; y++) + { + __m128 _b0 = (__m128)__lsx_vld(ptr1, 0); + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + + ptr1 += 4; + } + } + } + + return 0; + } + + if (b.dims == 2) + { + // type 27 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.row(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d; z++) + { + __m128 _b0 = (__m128)__lsx_vld(ptr1, 0); + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + } + + ptr1 += 4; + } + } + + return 0; + } + + if (b.dims == 1) + { + if (b.w == 1 && elempack1 == 1) + { + // type 25 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 26 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + __m128 _b0 = (__m128)__lsx_vld((const float*)b + q * 4, 0); + float* outptr = 
c.channel(q); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + } + + return 0; + } + } + else if (a.dims == 3) + { + if (b.dims == 4) + { + // type 23 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d1; z++) + { + for (int y = 0; y < h1; y++) + { + __m128 _a0 = (__m128)__lsx_vld(ptr, 0); + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + + ptr += 4; + } + } + } + + return 0; + } + + if (b.dims == 3) + { + if (w1 == 1 && h1 == 1 && channels1 == channels) + { + // special type 1 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* b0 = b.channel(q); + float* outptr = c.channel(q); + __m128 _b0 = (__m128)__lsx_vld(b0, 0); + for (int i = 0; i < size; i++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + } + + return 0; + } + + if (w1 == w && h1 == h && channels1 == 1 && elempack1 == 1) + { + // special type 2 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b; + float* outptr = c.channel(q); + for (int i = 0; i < size; i++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = __lsx_vreplfr2vr_s(ptr1[0]); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + ptr1 += 1; + outptr += 4; + } + } + + return 0; + } + + if (w == 1 && h == 1 && channels1 == channels) + { + // special type 3 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* a0 = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + __m128 _a0 = (__m128)__lsx_vld(a0, 0); + for (int i = 0; i < size1; i++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (w1 == w && h1 == h && channels == 1 && elempack == 1) + { + // special type 4 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a; + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + for (int i = 0; i < size1; i++) + { + __builtin_prefetch(ptr + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _p = __lsx_vreplfr2vr_s(ptr[0]); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_p, 
_p1); + __lsx_vst(_outp, outptr, 0); + ptr += 1; + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (w != 1 && w1 == 1 && h1 == h && channels1 == channels) + { + // special type 5 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + __m128 _p1 = (__m128)__lsx_vld(ptr1 + y * 4, 0); + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + + ptr += 4; + outptr += 4; + } + } + } + + return 0; + } + + if (w1 == w && h != 1 && h1 == 1 && channels1 == channels) + { + // special type 6 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1 + x * 4, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + + ptr += 4; + outptr += 4; + } + } + } + + return 0; + } + + if (w1 != 1 && w == 1 && h1 == h && channels1 == channels) + { + // special type 7 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + __m128 _p = (__m128)__lsx_vld(ptr + y * 4, 0); + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + + ptr1 += 4; + outptr += 4; + } + } + } + + return 0; + } + + if (w1 == w && h1 != 1 && h == 1 && channels1 == channels) + { + // special type 8 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p = (__m128)__lsx_vld(ptr + x * 4, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + + ptr1 += 4; + outptr += 4; + } + } + } + + return 0; + } + + // type 19 + return binary_op_7_13_19_29(a, b, c, opt); + } + + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 2) + { + // type 18 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.row(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + __m128 _b0 = (__m128)__lsx_vld(ptr1, 0); + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + 
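+                // store the pack-4 result and step to the next 4 floats of this row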
__lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + + ptr1 += 4; + } + } + + return 0; + } + + if (b.dims == 1) + { + if (b.w == 1 && elempack1 == 1) + { + // type 16 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 17 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + __m128 _b0 = (__m128)__lsx_vld((const float*)b + q * 4, 0); + float* outptr = c.channel(q); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + } + + return 0; + } + } + else if (a.dims == 2) + { + if (b.dims == 4) + { + // type 22 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.row(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d1; z++) + { + __m128 _a0 = (__m128)__lsx_vld(ptr, 0); + for (int y = 0; y < h1; y++) + { + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + } + + ptr += 4; + } + } + + return 0; + } + + if (b.dims == 3) + { + // type 14 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.row(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + __m128 _a0 = (__m128)__lsx_vld(ptr, 0); + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + + ptr += 4; + } + } + + return 0; + } + + c.create(w, h, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 2) + { + // type 13 + return binary_op_7_13_19_29(a, b, c, opt); + } + + if (b.dims == 1) + { + c.create(w, h, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.w == 1 && elempack1 == 1) + { + // type 11 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 12 + const float* ptr = a; + const float* ptr1 = b; + float* outptr = c; + + for (int y = 0; y < h; y++) + { + __m128 _b0 = (__m128)__lsx_vld(ptr1, 0); + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + + ptr1 += 4; + } + + return 0; + } + } + else if (a.dims == 1) + { + if (a.w == 1 && elempack == 1) + { + // type 2 3 4 20 + return binary_op_2_3_4_20(a, b, c, opt); + } + + if (b.dims == 4) + { + // type 21 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + __m128 _a0 = (__m128)__lsx_vld((const float*)a + q * 4, 0); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int i = 0; i < size1; i++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + 
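+            // _a0 holds channel q's pack-4 value of the 1-D input a; store the result
+            // and walk through the whole channel of b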
__lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (b.dims == 3) + { + // type 9 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + __m128 _a0 = (__m128)__lsx_vld((const float*)a + q * 4, 0); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int i = 0; i < size1; i++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (b.dims == 2) + { + // type 8 + c.create(w1, h1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + const float* ptr = a; + const float* ptr1 = b; + float* outptr = c; + + for (int y = 0; y < h1; y++) + { + __m128 _a0 = (__m128)__lsx_vld(ptr, 0); + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + + ptr += 4; + } + + return 0; + } + + if (b.dims == 1) + { + c.create(w, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.w == 1 && elempack1 == 1) + { + // type 6 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 7 + binary_op_7_13_19_29(a, b, c, opt); + } + } + + return 0; +} +#endif // __loongarch_sx + +template +static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = a.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _b = __lsx_vreplfr2vr_s(b); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = op(_p, _b); + __lsx_vst(_p, ptr, 0); + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = op(*ptr, b); + ptr++; + } + } + + return 0; +} + +namespace BinaryOp_loongarch_functor { + +#if __loongarch_sx +#define MAKE_FUNCTION(NAME, IMPL, IMPL4) \ + struct NAME \ + { \ + float operator()(const float& x, const float& y) const \ + { \ + return IMPL; \ + } \ + __m128 operator()(const __m128& x, const __m128& y) const \ + { \ + return IMPL4; \ + } \ + }; +#else +#define MAKE_FUNCTION(NAME, IMPL, IMPL4) \ + struct NAME \ + { \ + float operator()(const float& x, const float& y) const \ + { \ + return IMPL; \ + } \ + }; +#endif // __loongarch_sx + +// clang-format off +// *INDENT-OFF* +MAKE_FUNCTION(binary_op_add, x + y, __lsx_vfadd_s(x, y)) +MAKE_FUNCTION(binary_op_sub, x - y, __lsx_vfsub_s(x, y)) +MAKE_FUNCTION(binary_op_mul, x * y, __lsx_vfmul_s(x, y)) +MAKE_FUNCTION(binary_op_div, x / y, __lsx_vfdiv_s(x, y)) +MAKE_FUNCTION(binary_op_max, std::max(x, y), __lsx_vfmax_s(x, y)) +MAKE_FUNCTION(binary_op_min, std::min(x, y), __lsx_vfmin_s(x, y)) +MAKE_FUNCTION(binary_op_pow, (float)pow(x, y), pow_ps(x, y)) +MAKE_FUNCTION(binary_op_rsub, y - x, __lsx_vfsub_s(y, x)) +MAKE_FUNCTION(binary_op_rdiv, y / x, __lsx_vfdiv_s(y, x)) +// *INDENT-ON* +// clang-format on + +#undef MAKE_FUNCTION + +} // namespace BinaryOp_loongarch_functor + +int BinaryOp_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ 
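+    // when either input is packed (elempack 4), dispatch the templated pack-4 kernel
+    // for the requested op; every other layout falls through to BinaryOp::forward()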
+#if __loongarch_sx + using namespace BinaryOp_loongarch_functor; + + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& bottom_blob1 = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + int elempack = bottom_blob.elempack; + int elempack1 = bottom_blob1.elempack; + + if (elempack == 4 || elempack1 == 4) + { + if (op_type == Operation_ADD) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_SUB) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MUL) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_DIV) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MAX) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MIN) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_POW) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_RSUB) + return binary_op_pack4(bottom_blob1, bottom_blob, top_blob, opt); + + if (op_type == Operation_RDIV) + return binary_op_pack4(bottom_blob1, bottom_blob, top_blob, opt); + } +#endif // __loongarch_sx + + return BinaryOp::forward(bottom_blobs, top_blobs, opt); +} + +int BinaryOp_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + using namespace BinaryOp_loongarch_functor; + + if (op_type == Operation_ADD) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_SUB) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_MUL) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_DIV) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_MAX) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_MIN) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_POW) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_RSUB) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_RDIV) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/binaryop_loongarch.h b/src/layer/loongarch/binaryop_loongarch.h new file mode 100644 index 000000000000..bcf9ef5442fc --- /dev/null +++ b/src/layer/loongarch/binaryop_loongarch.h @@ -0,0 +1,34 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
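+// BinaryOp_loongarch overrides forward() for the two-input broadcasting path and
+// forward_inplace() for the scalar-operand path; both use the LSX pack-4 kernels
+// defined in binaryop_loongarch.cpp when __loongarch_sx is available.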
+ +#ifndef LAYER_BINARYOP_LOONGARCH_H +#define LAYER_BINARYOP_LOONGARCH_H + +#include "binaryop.h" + +namespace ncnn { + +class BinaryOp_loongarch : virtual public BinaryOp +{ +public: + BinaryOp_loongarch(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_BINARYOP_LOONGARCH_H diff --git a/src/layer/loongarch/cast_loongarch.cpp b/src/layer/loongarch/cast_loongarch.cpp new file mode 100644 index 000000000000..2e956657f142 --- /dev/null +++ b/src/layer/loongarch/cast_loongarch.cpp @@ -0,0 +1,209 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "cast_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +namespace ncnn { + +Cast_loongarch::Cast_loongarch() +{ + support_packing = true; +} + +int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + if (type_from == type_to) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + size_t out_elemsize = elemsize; + if (type_to == 1) + { + if (type_from == 3) + { + Cast::forward(bottom_blob, top_blob, opt); + } + + // float32 + out_elemsize = 4 * elempack; + } + else if (type_to == 2) + { + // float16 + out_elemsize = 2 * elempack; + } + else if (type_to == 3) + { + // int8 + out_elemsize = elempack; + } + else if (type_to == 4) + { + // bfloat16 + out_elemsize = 2 * elempack; + } + + if (dims == 1) + { + top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 2) + { + top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 3) + { + top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 4) + { + top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); + } + if (top_blob.empty()) + return -100; + + int size = w * h * d * elempack; + + if (type_from == 1 && type_to == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + unsigned short* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(ptr + 16); + __m128 _p0 = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr + 4, 0); + __m128i _p = __lsx_vfcvt_h_s(_p1, _p0); + __lsx_vst(_p, outptr, 0); + + ptr += 8; + outptr += 8; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = float32_to_float16(*ptr); + outptr++; + ptr++; + } + } + } + + if (type_from == 2 && type_to == 1) + { + #pragma 
omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const unsigned short* ptr = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(ptr + 16); + __m128i _p = __lsx_vld(ptr, 0); + __m128 _p0 = __lsx_vfcvtl_s_h(_p); + __m128 _p1 = __lsx_vfcvth_s_h(_p); + __lsx_vst(_p0, outptr, 0); + __lsx_vst(_p1, outptr + 4, 0); + + ptr += 8; + outptr += 8; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = float16_to_float32(*ptr); + outptr++; + ptr++; + } + } + } + + if (type_from == 3 && type_to == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const signed char* ptr = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + outptr[i] = (float)ptr[i]; + } + } + } + + if (type_from == 4 && type_to == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const unsigned short* ptr = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; + for (; i < size; i++) + { + *outptr = bfloat16_to_float32(*ptr); + outptr++; + ptr++; + } + } + } + + if (type_from == 1 && type_to == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + unsigned short* outptr = top_blob.channel(q); + + int i = 0; + for (; i < size; i++) + { + *outptr = float32_to_bfloat16(*ptr); + outptr++; + ptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/cast_loongarch.h b/src/layer/loongarch/cast_loongarch.h new file mode 100644 index 000000000000..1fe75c687d8e --- /dev/null +++ b/src/layer/loongarch/cast_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CAST_LOONGARCH_H +#define LAYER_CAST_LOONGARCH_H + +#include "cast.h" + +namespace ncnn { + +class Cast_loongarch : virtual public Cast +{ +public: + Cast_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_CAST_LOONGARCH_H diff --git a/src/layer/loongarch/clip_loongarch.cpp b/src/layer/loongarch/clip_loongarch.cpp new file mode 100644 index 000000000000..7cf0246d060c --- /dev/null +++ b/src/layer/loongarch/clip_loongarch.cpp @@ -0,0 +1,76 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "clip_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Clip_loongarch::Clip_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Clip_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _max = (__m128)__lsx_vreplfr2vr_s(max); + __m128 _min = (__m128)__lsx_vreplfr2vr_s(min); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmax_s(_p, _min); + _p = __lsx_vfmin_s(_p, _max); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < min) + *ptr = min; + + if (*ptr > max) + *ptr = max; + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/clip_loongarch.h b/src/layer/loongarch/clip_loongarch.h new file mode 100644 index 000000000000..43df62035ff3 --- /dev/null +++ b/src/layer/loongarch/clip_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CLIP_LOONGARCH_H +#define LAYER_CLIP_LOONGARCH_H + +#include "clip.h" + +namespace ncnn { + +class Clip_loongarch : virtual public Clip +{ +public: + Clip_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_CLIP_LOONGARCH_H diff --git a/src/layer/loongarch/concat_loongarch.cpp b/src/layer/loongarch/concat_loongarch.cpp new file mode 100644 index 000000000000..50460f8c134b --- /dev/null +++ b/src/layer/loongarch/concat_loongarch.cpp @@ -0,0 +1,348 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "concat_loongarch.h" + +namespace ncnn { + +Concat_loongarch::Concat_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Concat_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + int dims = bottom_blobs[0].dims; + int positive_axis = axis < 0 ? dims + axis : axis; + + if (dims == 1) // positive_axis == 0 + { + // concat vector + // total length + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w * bottom_blob.elempack; + } + + int out_elempack = opt.use_packing_layout && top_w % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + float* outptr = top_blob; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + const float* ptr = bottom_blob; + memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize); + + outptr += bottom_blob.w * bottom_blob.elempack; + } + } + + if (dims == 2 && positive_axis == 0) + { + // concat image + int w = bottom_blobs[0].w; + + // total height + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_h = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_h += bottom_blob.h * bottom_blob.elempack; + } + + int out_elempack = opt.use_packing_layout && top_h % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + Mat top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + float* outptr = top_blob_unpacked; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + if (bottom_blob.elempack == 4 && elempack == 1) + { + for (int i = 0; i < bottom_blob.h; i++) + { + const float* r0 = bottom_blob.row(i); + + float* outptr0 = outptr; + float* outptr1 = outptr + w; + float* outptr2 = outptr + w * 2; + float* outptr3 = outptr + w * 3; + + for (int j = 0; j < w; j++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + + outptr += w * 4; + } + } + else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4) + { + int size = w * bottom_blob.h; + + const float* ptr = bottom_blob; + memcpy(outptr, ptr, size * bottom_blob.elemsize); + + outptr += size * bottom_blob.elempack; + } + } + + // packing + if (elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + } + + if (dims == 2 && positive_axis == 1) + { + // interleave image row + int h = bottom_blobs[0].h; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total width + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w; + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* outptr = top_blob.row(i); + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + const float* ptr = bottom_blob.row(i); + memcpy(outptr, ptr, bottom_blob.w * elemsize); + + outptr += bottom_blob.w * elempack; + } + } + } + + if (dims == 3 && positive_axis == 0) + { + // concat dim + int w = bottom_blobs[0].w; + int h = bottom_blobs[0].h; + + // total channels + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_channels = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_channels += bottom_blob.c * bottom_blob.elempack; + } + + int out_elempack = opt.use_packing_layout && top_channels % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + Mat top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + int p = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + if (bottom_blob.elempack == 4 && elempack == 1) + { + int size = bottom_blob.w * bottom_blob.h; + + for (int q = 0; q < bottom_blob.c; q++) + { + const float* r0 = bottom_blob.channel(q); + + float* outptr0 = top_blob_unpacked.channel(p); + float* outptr1 = top_blob_unpacked.channel(p + 1); + float* outptr2 = top_blob_unpacked.channel(p + 2); + float* outptr3 = top_blob_unpacked.channel(p + 3); + + for (int i = 0; i < size; i++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + + p += 4; + } + } + else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4) + { + int size = bottom_blob.total(); + + const float* ptr = bottom_blob; + float* outptr = top_blob_unpacked.channel(p); + memcpy(outptr, ptr, size * bottom_blob.elemsize); + + p += bottom_blob.c; + } + } + + // packing + if (elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + } + + if (dims == 3 && positive_axis == 1) + { + // interleave dim height + int w = bottom_blobs[0].w; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_h = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + top_h += bottom_blob.h; + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + int size = bottom_blob.w * bottom_blob.h; + + const float* ptr = bottom_blob.channel(q); + memcpy(outptr, ptr, size * elemsize); + + outptr += size * elempack; + } + } + } + + if (dims == 3 && positive_axis == 2) + { + // interleave dim width + int h = bottom_blobs[0].h; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w; + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + + for (int i = 0; i < h; i++) + { + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + const float* ptr = bottom_blob.channel(q).row(i); + memcpy(outptr, ptr, bottom_blob.w * elemsize); + + outptr += bottom_blob.w * elempack; + } + } + } + } + + return 0; +} + +} // 
namespace ncnn diff --git a/src/layer/loongarch/concat_loongarch.h b/src/layer/loongarch/concat_loongarch.h new file mode 100644 index 000000000000..934c85244df3 --- /dev/null +++ b/src/layer/loongarch/concat_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CONCAT_LOONGARCH_H +#define LAYER_CONCAT_LOONGARCH_H + +#include "concat.h" + +namespace ncnn { + +class Concat_loongarch : virtual public Concat +{ +public: + Concat_loongarch(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_CONCAT_LOONGARCH_H diff --git a/src/layer/loongarch/convolution1d_loongarch.cpp b/src/layer/loongarch/convolution1d_loongarch.cpp new file mode 100644 index 000000000000..0b1a11c868f0 --- /dev/null +++ b/src/layer/loongarch/convolution1d_loongarch.cpp @@ -0,0 +1,379 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution1d_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +Convolution1D_loongarch::Convolution1D_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Convolution1D_loongarch::create_pipeline(const Option& opt) +{ + if (dynamic_weight) + return 0; + + const int num_input = weight_data_size / kernel_w / num_output; + + int elempack = 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + + // src = kw-inch-outch + // dst = pb-pa-kw-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(kernel_w, num_input, num_output); + + weight_data_packed.create(kernel_w, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + float* g00 = weight_data_packed.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < kernel_w; k++) + { + for (int i = 0; i < elempack; i++) + { + for (int j = 0; j < out_elempack; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + } + + return 0; +} + +int Convolution1D_loongarch::destroy_pipeline(const Option& /*opt*/) +{ + return 0; +} + +int Convolution1D_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + const int outw = (w - kernel_extent_w) / stride_w + 1; + const int outh = num_output / out_elempack; + + top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 4 && out_elempack == 4) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + float* outptr = top_blob.row(p); + + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + const float* kptr = weight_data_packed.channel(p); + + for (int q = 0; q < h; q++) + { + const float* sptr = bottom_blob_bordered.row(q) + j * stride_w * 4; + + for (int k = 0; k < kernel_w; k++) + { + __m128 _val0 = __lsx_vreplfr2vr_s(sptr[0]); + __m128 _val1 = __lsx_vreplfr2vr_s(sptr[1]); + __m128 _val2 = __lsx_vreplfr2vr_s(sptr[2]); + __m128 _val3 = __lsx_vreplfr2vr_s(sptr[3]); + + __m128 _w0 = (__m128)__lsx_vld(kptr, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0); + + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + _sum = __lsx_vfmadd_s(_w1, _val1, _sum); + _sum = __lsx_vfmadd_s(_w2, _val2, _sum); + _sum = __lsx_vfmadd_s(_w3, _val3, _sum); + + sptr += dilation_w * 4; + kptr += 16; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr, 0); + outptr += 4; + } + } + } + } + + if (elempack == 1 && out_elempack == 4) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + float* outptr = top_blob.row(p); + + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + const float* kptr = weight_data_packed.channel(p); + + for (int q = 0; q < h; 
q++) + { + const float* sptr = bottom_blob_bordered.row(q) + j * stride_w; + + for (int k = 0; k < kernel_w; k++) + { + __m128 _val = __lsx_vreplfr2vr_s(sptr[0]); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + + sptr += dilation_w; + kptr += 4; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr, 0); + outptr += 4; + } + } + } + } + + if (elempack == 4 && out_elempack == 1) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + float* outptr = top_blob.row(p); + + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + const float* kptr = weight_data_packed.channel(p); + + for (int q = 0; q < h; q++) + { + const float* sptr = bottom_blob_bordered.row(q) + j * stride_w * 4; + + for (int k = 0; k < kernel_w; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + + sptr += dilation_w * 4; + kptr += 4; + } + } + + sum += __lsx_reduce_fadd_s(_sum); + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + } + } + } +#endif // __loongarch_sx + + if (elempack == 1 && out_elempack == 1) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + float* outptr = top_blob.row(p); + + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + const float* kptr = (const float*)weight_data + kernel_w * h * p; + + for (int q = 0; q < h; q++) + { + const float* sptr = bottom_blob_bordered.row(q) + j * stride_w; + + for (int k = 0; k < kernel_w; k++) + { + float val = sptr[0]; + float wt = kptr[0]; + sum += val * wt; + + sptr += dilation_w; + kptr += 1; + } + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + } + } + } + + return 0; +} + +int Convolution1D_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& _weight_data = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + const int _kernel_w = _weight_data.w; + const int _num_output = _weight_data.c * _weight_data.elempack; + + Mat weight_data_flattened; + flatten(_weight_data, weight_data_flattened, opt); + if (weight_data_flattened.empty()) + return -100; + + // weight_data_flattened as pack1 + weight_data_flattened.w *= weight_data_flattened.elempack; + weight_data_flattened.elemsize /= weight_data_flattened.elempack; + weight_data_flattened.elempack = 1; + + Mat bias_data_flattened; + if (bias_term) + { + const Mat& _bias_data = bottom_blobs[2]; + flatten(_bias_data, bias_data_flattened, opt); + if (bias_data_flattened.empty()) + return -100; + + // bias_data_flattened as pack1 + bias_data_flattened.w *= bias_data_flattened.elempack; + bias_data_flattened.elemsize /= bias_data_flattened.elempack; + bias_data_flattened.elempack = 1; + } + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D); + + ncnn::ParamDict pd; + pd.set(0, _num_output); + pd.set(1, _kernel_w); + pd.set(2, dilation_w); + pd.set(3, stride_w); + pd.set(4, pad_left); + pd.set(15, pad_right); + pd.set(18, pad_value); + pd.set(5, bias_term); + pd.set(6, weight_data_flattened.w); + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + ncnn::Mat 
weights[2]; + weights[0] = weight_data_flattened; + weights[1] = bias_data_flattened; + + op->load_model(ncnn::ModelBinFromMatArray(weights)); + + op->create_pipeline(opt); + + op->forward(bottom_blob, top_blob, opt); + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/convolution1d_loongarch.h b/src/layer/loongarch/convolution1d_loongarch.h new file mode 100644 index 000000000000..36393df45688 --- /dev/null +++ b/src/layer/loongarch/convolution1d_loongarch.h @@ -0,0 +1,41 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CONVOLUTION1D_LOONGARCH_H +#define LAYER_CONVOLUTION1D_LOONGARCH_H + +#include "convolution1d.h" + +namespace ncnn { + +class Convolution1D_loongarch : virtual public Convolution1D +{ +public: + Convolution1D_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +public: + // packn + Mat weight_data_packed; +}; + +} // namespace ncnn + +#endif // LAYER_CONVOLUTION1D_LOONGARCH_H diff --git a/src/layer/loongarch/convolution_1x1.h b/src/layer/loongarch/convolution_1x1.h new file mode 100644 index 000000000000..83d3778411ae --- /dev/null +++ b/src/layer/loongarch/convolution_1x1.h @@ -0,0 +1,26 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv1x1s1_sgemm_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_int8.h b/src/layer/loongarch/convolution_1x1_int8.h new file mode 100644 index 000000000000..08f439c484ae --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_int8.h @@ -0,0 +1,83 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. 
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv1x1s1_sgemm_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const signed char* r0 = bottom_blob.channel(p); + signed char* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + outptr[2] = r0[4]; + outptr[3] = r0[6]; + + r0 += 8; + outptr += 4; + } + for (; j + 1 < outw; j += 2) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + + r0 += 4; + outptr += 2; + } + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack1to4_int8.h b/src/layer/loongarch/convolution_1x1_pack1to4_int8.h new file mode 100644 index 000000000000..00e1e2581417 --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack1to4_int8.h @@ -0,0 +1,83 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
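+
+// conv1x1s1_sgemm_pack1to4_int8_lsx views the whole w*h plane as a single
+// im2col row and forwards it to the packed int8 sgemm, since a 1x1 stride-1
+// convolution is just a GEMM.  conv1x1s2_sgemm_pack1to4_int8_lsx first gathers
+// every second pixel of every second row into a shrunken blob and then reuses
+// the stride-1 path; in rough scalar pseudo-code (illustrative indexing only):
+//
+//   for (int p = 0; p < channels; p++)
+//       for (int i = 0; i < outh; i++)
+//           for (int j = 0; j < outw; j++)
+//               shrinked(p, i, j) = input(p, i * 2, j * 2);
+//
+// The tailstep "w - 2 * outw + w" skips the tail of the current input row plus
+// the entire next row (stride_h == 2).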
+ +static void conv1x1s1_sgemm_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack1to4_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const signed char* r0 = bottom_blob.channel(p); + signed char* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + outptr[2] = r0[4]; + outptr[3] = r0[6]; + + r0 += 8; + outptr += 4; + } + for (; j + 1 < outw; j += 2) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + + r0 += 4; + outptr += 2; + } + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack1to4_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack4.h b/src/layer/loongarch/convolution_1x1_pack4.h new file mode 100644 index 000000000000..cf5a5b8e3638 --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack4.h @@ -0,0 +1,65 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
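+
+// A 1x1 stride-1 convolution over a pack4 fp32 blob is a plain GEMM, so
+// conv1x1s1_sgemm_pack4_lsx only reshapes the blob into one im2col row and
+// calls im2col_sgemm_pack4_lsx.  The stride-2 variant copies every second
+// pack4 pixel with one __lsx_vld/__lsx_vst pair into a shrunken blob and then
+// falls through to the stride-1 kernel; tailstep is scaled by 4 because each
+// pixel stores four packed floats.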
+ +static void conv1x1s1_sgemm_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack4_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} + +static void conv1x1s2_sgemm_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = (w - 2 * outw + w) * 4; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* r0 = bottom_blob.channel(p); + float* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _val = (__m128)__lsx_vld(r0, 0); + __lsx_vst(_val, outptr, 0); + + r0 += 4 * 2; + outptr += 4; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack4_lsx(bottom_blob_shrinked, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack4to1.h b/src/layer/loongarch/convolution_1x1_pack4to1.h new file mode 100644 index 000000000000..b87129091e4a --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack4to1.h @@ -0,0 +1,65 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
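+
+// pack4to1: the input stays in ncnn's elempack=4 layout (four consecutive
+// channels interleaved per pixel, so one vector load moves a whole pixel)
+// while the output is written back as plain elempack=1 channels by
+// im2col_sgemm_pack4to1_lsx.  The stride-1/stride-2 structure is the same as
+// in convolution_1x1_pack4.h: reshape to a single im2col row, or shrink the
+// input by a factor of two first and then reuse the stride-1 path.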
+ +static void conv1x1s1_sgemm_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack4to1_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} + +static void conv1x1s2_sgemm_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = (w - 2 * outw + w) * 4; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* r0 = bottom_blob.channel(p); + float* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _val = (__m128)__lsx_vld(r0, 0); + __lsx_vst(_val, outptr, 0); + + r0 += 4 * 2; + outptr += 4; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack4to1_lsx(bottom_blob_shrinked, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack8to1_int8.h b/src/layer/loongarch/convolution_1x1_pack8to1_int8.h new file mode 100644 index 000000000000..8df0e128b7fb --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack8to1_int8.h @@ -0,0 +1,65 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
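+
+// pack8to1 int8: eight int8 input channels are packed into one 64-bit element,
+// so the stride-2 shrink below copies a whole pixel per int64_t load/store.
+// As with the other 1x1 kernels, stride-1 is the packed int8 sgemm over a
+// single im2col row and stride-2 reduces to it after the shrink.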
+ +static void conv1x1s1_sgemm_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack8to1_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const int64_t* r0 = bottom_blob.channel(p); + int64_t* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack8to1_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack8to4_int8.h b/src/layer/loongarch/convolution_1x1_pack8to4_int8.h new file mode 100644 index 000000000000..6aaa720d23d0 --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack8to4_int8.h @@ -0,0 +1,65 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
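+
+// pack8to4 int8: same shrink-then-sgemm structure as the pack8to1 variant,
+// but the accumulated results are written back in pack4 layout by
+// im2col_sgemm_pack8to4_int8_lsx.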
+ +static void conv1x1s1_sgemm_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack8to4_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const int64_t* r0 = bottom_blob.channel(p); + int64_t* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack8to4_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_3x3.h b/src/layer/loongarch/convolution_3x3.h new file mode 100644 index 000000000000..66e10106b46c --- /dev/null +++ b/src/layer/loongarch/convolution_3x3.h @@ -0,0 +1,412 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
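+
+// This header implements Winograd convolution for the 3x3 stride-1 case:
+// winograd23 produces a 2x2 output tile from a 4x4 input tile (F(2,3)) and
+// winograd43 a 4x4 output tile from a 6x6 input tile (F(4,3)), following the
+// usual decomposition
+//
+//   Y = A^T [ (G g G^T) .* (B^T d B) ] A
+//
+// where g is the 3x3 kernel, d an input tile and .* an element-wise product.
+// The *_transform_kernel_lsx functions precompute U = G g G^T (ktm holds G)
+// and interleave it in groups of 8/4 output channels for the LSX dot kernel
+// (2 without LSX).  The conv3x3s1_winogradXX_lsx drivers pad the input on the
+// bottom/right so the output becomes a whole number of tiles, transform the
+// input tiles, run convolution_winograd_dot_lsx, transform the output tiles
+// back and finally crop the padding away.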
+ +static void conv3x3s1_winograd23_transform_kernel_lsx(const Mat& kernel, Mat& kernel_tm2, int inch, int outch, const Option& opt) +{ + Mat kernel_tm(4 * 4, inch, outch); + + // G + const float ktm[4][3] = { + {1.0f, 0.0f, 0.0f}, + {1.0f / 2, 1.0f / 2, 1.0f / 2}, + {1.0f / 2, -1.0f / 2, 1.0f / 2}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[4][3]; + for (int i = 0; i < 4; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 4; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 4; i++) + { + kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 16-inch-outch + // dst = inch-16-outch +#if __loongarch_sx + kernel_tm2.create(8 * inch, 16, outch / 8 + (outch % 8) / 4 + outch % 4); +#else + kernel_tm2.create(2 * inch, 16, outch / 2 + outch % 2); +#endif + + int q = 0; +#if __loongarch_sx + for (; q + 7 < outch; q += 8) + { + Mat g0 = kernel_tm2.channel(q / 8); + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 8; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } + for (; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm2.channel(q / 8 + (q % 8) / 4); + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 4; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#else // __loongarch_sx + for (; q + 1 < outch; q += 2) + { + Mat g0 = kernel_tm2.channel(q / 2); + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 2; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#endif // __loongarch_sx + for (; q < outch; q++) + { +#if __loongarch_sx + Mat g0 = kernel_tm2.channel(q / 8 + (q % 8) / 4 + q % 4); +#else + Mat g0 = kernel_tm2.channel(q / 2 + q % 2); +#endif + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + const float* k00 = kernel_tm.channel(q).row(p); + g00[0] = k00[k]; + g00++; + } + } + } +} + +static void conv3x3s1_winograd23_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 2n+2, winograd F(2,3) + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 1) / 2 * 2; + outh = (outh + 1) / 2 * 2; + + w = outw + 2; + h = outh + 2; + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw 
/ 2; + int h_tiles = outh / 2; + int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 16, inch, 4u, opt.workspace_allocator); + conv3x3s1_winograd23_transform_input_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator); + } + { + conv3x3s1_winograd23_transform_output_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} + +static void conv3x3s1_winograd43_transform_kernel_lsx(const Mat& kernel, Mat& kernel_tm2, int inch, int outch, const Option& opt) +{ + Mat kernel_tm(6 * 6, inch, outch); + + // G + const float ktm[6][3] = { + {1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = inch-36-outch +#if __loongarch_sx + kernel_tm2.create(8 * inch, 36, outch / 8 + (outch % 8) / 4 + outch % 4); +#else + kernel_tm2.create(2 * inch, 36, outch / 2 + outch % 2); +#endif + + int q = 0; +#if __loongarch_sx + for (; q + 7 < outch; q += 8) + { + Mat g0 = kernel_tm2.channel(q / 8); + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 8; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } + for (; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm2.channel(q / 8 + (q % 8) / 4); + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 4; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#else // __loongarch_sx + for (; q + 1 < outch; q += 2) + { + Mat g0 = kernel_tm2.channel(q / 2); + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 2; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#endif // __loongarch_sx + for (; q < outch; q++) + { +#if __loongarch_sx + Mat g0 = kernel_tm2.channel(q / 8 + 
(q % 8) / 4 + q % 4); +#else + Mat g0 = kernel_tm2.channel(q / 2 + q % 2); +#endif + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + const float* k00 = kernel_tm.channel(q).row(p); + g00[0] = k00[k]; + g00++; + } + } + } +} + +static void conv3x3s1_winograd43_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2, winograd F(4,3) + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, 4u, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_3x3_int8.h b/src/layer/loongarch/convolution_3x3_int8.h new file mode 100644 index 000000000000..3ea28dd09445 --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_int8.h @@ -0,0 +1,252 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
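+
+// Int8 flavour of the F(4,3) Winograd path in convolution_3x3.h.  The kernel
+// transform uses an integer-scaled G matrix (the short ktm table) so the
+// transformed weights fit in 16 bits, and the packed layout groups 4
+// input/output channels under LSX (2 output channels otherwise).  The driver
+// follows the same pad -> transform input -> winograd dot -> transform output
+// -> crop sequence, with 2-byte elements for the transformed input tiles and
+// 4-byte elements for the accumulated output.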
+ +static void conv3x3s1_winograd43_transform_kernel_int8_lsx(const Mat& kernel, Mat& kernel_tm_packed, int inch, int outch, const Option& opt) +{ + // winograd43 transform kernel + Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); + + const short ktm[6][3] = { + {6, 0, 0}, + {-4, -4, -4}, + {-4, 4, -4}, + {1, 2, 4}, + {1, -2, 4}, + {0, 0, 6} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; + short* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const signed char* k0 = kernel0; + const signed char* k1 = kernel0 + 3; + const signed char* k2 = kernel0 + 6; + + // h + short tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + short* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = 2b-inch-36-outch/2b +#if __loongarch_sx + if (outch >= 4) + { + if (inch >= 4) + kernel_tm_packed.create(inch / 4 + inch % 4, 36, outch / 4 + outch % 4, (size_t)2u * 16, 16); + else + kernel_tm_packed.create(inch, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4); + } +#else // __loongarch_sx + if (outch >= 2) + { + kernel_tm_packed.create(inch, 36, outch / 2 + outch % 2, (size_t)2u * 2, 2); + } +#endif // __loongarch_sx + else + { +#if __loongarch_sx + if (inch >= 4) + kernel_tm_packed.create(inch / 4 + inch % 4, 36, outch, (size_t)2u * 4, 4); + else +#endif // __loongarch_sx + { + kernel_tm_packed.create(inch, 36, outch, (size_t)2u, 1); + } + } + + int p = 0; +#if __loongarch_sx + for (; p + 3 < outch; p += 4) + { + const Mat k0 = kernel_tm.channel(p); + const Mat k1 = kernel_tm.channel(p + 1); + const Mat k2 = kernel_tm.channel(p + 2); + const Mat k3 = kernel_tm.channel(p + 3); + + Mat g0 = kernel_tm_packed.channel(p / 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + int q = 0; + for (; q + 3 < inch; q += 4) + { + g00[0] = k0.row(q)[k]; + g00[1] = k0.row(q + 1)[k]; + g00[2] = k0.row(q + 2)[k]; + g00[3] = k0.row(q + 3)[k]; + g00[4] = k1.row(q)[k]; + g00[5] = k1.row(q + 1)[k]; + g00[6] = k1.row(q + 2)[k]; + g00[7] = k1.row(q + 3)[k]; + g00[8] = k2.row(q)[k]; + g00[9] = k2.row(q + 1)[k]; + g00[10] = k2.row(q + 2)[k]; + g00[11] = k2.row(q + 3)[k]; + g00[12] = k3.row(q)[k]; + g00[13] = k3.row(q + 1)[k]; + g00[14] = k3.row(q + 2)[k]; + g00[15] = k3.row(q + 3)[k]; + g00 += 16; + } + for (; q < inch; q++) + { + g00[0] = k0.row(q)[k]; + g00[1] = k1.row(q)[k]; + g00[2] = k2.row(q)[k]; + g00[3] = k3.row(q)[k]; + g00 += 4; + } + } + } +#else // __loongarch_sx + for (; p + 1 < outch; p += 2) + { + const Mat k0 = kernel_tm.channel(p); + const Mat k1 = kernel_tm.channel(p + 1); + + Mat g0 = kernel_tm_packed.channel(p / 2); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + int q = 0; + for (; q < inch; q++) + { + g00[0] = k0.row(q)[k]; + g00[1] = k1.row(q)[k]; + g00 += 2; + } + } + } +#endif // __loongarch_sx + for (; p < outch; p++) + { + const Mat k0 = kernel_tm.channel(p); + +#if __loongarch_sx + Mat g0 = kernel_tm_packed.channel(p / 4 + p % 4); +#else + Mat g0 = 
kernel_tm_packed.channel(p / 2 + p % 2); +#endif + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + int q = 0; +#if __loongarch_sx + for (; q + 3 < inch; q += 4) + { + g00[0] = k0.row(q)[k]; + g00[1] = k0.row(q + 1)[k]; + g00[2] = k0.row(q + 2)[k]; + g00[3] = k0.row(q + 3)[k]; + g00 += 4; + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + g00[0] = k0.row(q)[k]; + g00 += 1; + } + } + } +} + +static void conv3x3s1_winograd43_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + // size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_int8_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_int8_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_int8_lsx(top_blob_tm, top_blob_bordered, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_3x3_pack1to4.h b/src/layer/loongarch/convolution_3x3_pack1to4.h new file mode 100644 index 000000000000..2bcb0ce166dd --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_pack1to4.h @@ -0,0 +1,812 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int inch = bottom_blob.c; + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = bias ? 
(__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + out0.fill(_bias0); + + const float* k0 = kernel.channel(p); + + int q = 0; + for (; q < inch; q++) + { + float* outptr0 = out0; + + const Mat img0 = bottom_blob.channel(q); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0); + __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j + 7 < outw; j += 8) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + __m128 _sum4 = (__m128)__lsx_vld(outptr0 + 4 * 4, 0); + __m128 _sum5 = (__m128)__lsx_vld(outptr0 + 4 * 5, 0); + __m128 _sum6 = (__m128)__lsx_vld(outptr0 + 4 * 6, 0); + __m128 _sum7 = (__m128)__lsx_vld(outptr0 + 4 * 7, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + __m128i _r0nn = __lsx_vld(r0 + 8, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2); + __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3); + __m128 _r08 = (__m128)__lsx_vreplvei_w(_r0nn, 0); + __m128 _r09 = (__m128)__lsx_vreplvei_w(_r0nn, 1); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r01, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r02, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r03, _sum3); + _sum4 = __lsx_vfmadd_s(_k00, _r04, _sum4); + _sum5 = __lsx_vfmadd_s(_k00, _r05, _sum5); + _sum6 = __lsx_vfmadd_s(_k00, _r06, _sum6); + _sum7 = __lsx_vfmadd_s(_k00, _r07, _sum7); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r03, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r04, _sum3); + _sum4 = __lsx_vfmadd_s(_k01, _r05, _sum4); + _sum5 = __lsx_vfmadd_s(_k01, _r06, _sum5); + _sum6 = __lsx_vfmadd_s(_k01, _r07, _sum6); + _sum7 = __lsx_vfmadd_s(_k01, _r08, _sum7); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r05, _sum3); + _sum4 = __lsx_vfmadd_s(_k02, _r06, _sum4); + _sum5 = __lsx_vfmadd_s(_k02, _r07, _sum5); + _sum6 = __lsx_vfmadd_s(_k02, _r08, _sum6); + _sum7 = __lsx_vfmadd_s(_k02, _r09, _sum7); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + __m128i _r1nn = __lsx_vld(r1 + 8, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2); + __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3); + __m128 _r18 = 
(__m128)__lsx_vreplvei_w(_r1nn, 0); + __m128 _r19 = (__m128)__lsx_vreplvei_w(_r1nn, 1); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r11, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r12, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r13, _sum3); + _sum4 = __lsx_vfmadd_s(_k10, _r14, _sum4); + _sum5 = __lsx_vfmadd_s(_k10, _r15, _sum5); + _sum6 = __lsx_vfmadd_s(_k10, _r16, _sum6); + _sum7 = __lsx_vfmadd_s(_k10, _r17, _sum7); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r13, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r14, _sum3); + _sum4 = __lsx_vfmadd_s(_k11, _r15, _sum4); + _sum5 = __lsx_vfmadd_s(_k11, _r16, _sum5); + _sum6 = __lsx_vfmadd_s(_k11, _r17, _sum6); + _sum7 = __lsx_vfmadd_s(_k11, _r18, _sum7); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r15, _sum3); + _sum4 = __lsx_vfmadd_s(_k12, _r16, _sum4); + _sum5 = __lsx_vfmadd_s(_k12, _r17, _sum5); + _sum6 = __lsx_vfmadd_s(_k12, _r18, _sum6); + _sum7 = __lsx_vfmadd_s(_k12, _r19, _sum7); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + __m128i _r2nn = __lsx_vld(r2 + 8, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2); + __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3); + __m128 _r28 = (__m128)__lsx_vreplvei_w(_r2nn, 0); + __m128 _r29 = (__m128)__lsx_vreplvei_w(_r2nn, 1); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r21, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r22, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r23, _sum3); + _sum4 = __lsx_vfmadd_s(_k20, _r24, _sum4); + _sum5 = __lsx_vfmadd_s(_k20, _r25, _sum5); + _sum6 = __lsx_vfmadd_s(_k20, _r26, _sum6); + _sum7 = __lsx_vfmadd_s(_k20, _r27, _sum7); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r23, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r24, _sum3); + _sum4 = __lsx_vfmadd_s(_k21, _r25, _sum4); + _sum5 = __lsx_vfmadd_s(_k21, _r26, _sum5); + _sum6 = __lsx_vfmadd_s(_k21, _r27, _sum6); + _sum7 = __lsx_vfmadd_s(_k21, _r28, _sum7); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r25, _sum3); + _sum4 = __lsx_vfmadd_s(_k22, _r26, _sum4); + _sum5 = __lsx_vfmadd_s(_k22, _r27, _sum5); + _sum6 = __lsx_vfmadd_s(_k22, _r28, _sum6); + _sum7 = __lsx_vfmadd_s(_k22, _r29, _sum7); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + __lsx_vst(_sum4, outptr0 + 4 * 4, 0); + __lsx_vst(_sum5, outptr0 + 4 * 5, 0); + __lsx_vst(_sum6, outptr0 + 4 * 6, 0); + __lsx_vst(_sum7, outptr0 + 4 * 7, 0); + + outptr0 += 4 * 8; + + r0 += 8; + r1 += 8; + r2 += 8; + } + for (; j + 3 < outw; j += 4) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = 
__lsx_vld(r0 + 4, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r01, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r02, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r03, _sum3); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r03, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r04, _sum3); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r05, _sum3); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r11, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r12, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r13, _sum3); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r13, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r14, _sum3); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r15, _sum3); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r21, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r22, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r23, _sum3); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r23, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r24, _sum3); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r25, _sum3); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + + outptr0 += 4 * 4; + + r0 += 4; + r1 += 4; + r2 += 4; + } + for (; j + 1 < outw; j += 2) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r01, _sum1); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r02, _sum1); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, 
_r03, _sum1); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r11, _sum1); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r12, _sum1); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r13, _sum1); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r21, _sum1); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r22, _sum1); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r23, _sum1); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + + outptr0 += 4 * 2; + + r0 += 2; + r1 += 2; + r2 += 2; + } + for (; j < outw; j++) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 1; + r1 += 1; + r2 += 1; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + + k0 += 9 * 4; + } + } +} + +static void conv3x3s2_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int tailstep = w - 2 * outw + w; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = bias ? 
(__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + out0.fill(_bias0); + + const float* k0 = kernel.channel(p); + + int q = 0; + for (; q < inch; q++) + { + float* outptr0 = out0; + + const Mat img0 = bottom_blob.channel(q); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0); + __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j + 7 < outw; j += 8) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + __m128 _sum4 = (__m128)__lsx_vld(outptr0 + 4 * 4, 0); + __m128 _sum5 = (__m128)__lsx_vld(outptr0 + 4 * 5, 0); + __m128 _sum6 = (__m128)__lsx_vld(outptr0 + 4 * 6, 0); + __m128 _sum7 = (__m128)__lsx_vld(outptr0 + 4 * 7, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + __m128i _r0nn = __lsx_vld(r0 + 8, 0); + __m128i _r0nnn = __lsx_vld(r0 + 12, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2); + __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3); + __m128 _r08 = (__m128)__lsx_vreplvei_w(_r0nn, 0); + __m128 _r09 = (__m128)__lsx_vreplvei_w(_r0nn, 1); + __m128 _r0a = (__m128)__lsx_vreplvei_w(_r0nn, 2); + __m128 _r0b = (__m128)__lsx_vreplvei_w(_r0nn, 3); + __m128 _r0c = (__m128)__lsx_vreplvei_w(_r0nnn, 0); + __m128 _r0d = (__m128)__lsx_vreplvei_w(_r0nnn, 1); + __m128 _r0e = (__m128)__lsx_vreplvei_w(_r0nnn, 2); + __m128 _r0f = (__m128)__lsx_vreplvei_w(_r0nnn, 3); + __m128 _r0g = __lsx_vreplfr2vr_s(r0[16]); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r06, _sum3); + _sum4 = __lsx_vfmadd_s(_k00, _r08, _sum4); + _sum5 = __lsx_vfmadd_s(_k00, _r0a, _sum5); + _sum6 = __lsx_vfmadd_s(_k00, _r0c, _sum6); + _sum7 = __lsx_vfmadd_s(_k00, _r0e, _sum7); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r05, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r07, _sum3); + _sum4 = __lsx_vfmadd_s(_k01, _r09, _sum4); + _sum5 = __lsx_vfmadd_s(_k01, _r0b, _sum5); + _sum6 = __lsx_vfmadd_s(_k01, _r0d, _sum6); + _sum7 = __lsx_vfmadd_s(_k01, _r0f, _sum7); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r06, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r08, _sum3); + _sum4 = __lsx_vfmadd_s(_k02, _r0a, _sum4); + _sum5 = __lsx_vfmadd_s(_k02, _r0c, _sum5); + _sum6 = __lsx_vfmadd_s(_k02, _r0e, _sum6); + _sum7 = __lsx_vfmadd_s(_k02, _r0g, _sum7); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + __m128i _r1nn = __lsx_vld(r1 + 8, 0); + __m128i _r1nnn = 
__lsx_vld(r1 + 12, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2); + __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3); + __m128 _r18 = (__m128)__lsx_vreplvei_w(_r1nn, 0); + __m128 _r19 = (__m128)__lsx_vreplvei_w(_r1nn, 1); + __m128 _r1a = (__m128)__lsx_vreplvei_w(_r1nn, 2); + __m128 _r1b = (__m128)__lsx_vreplvei_w(_r1nn, 3); + __m128 _r1c = (__m128)__lsx_vreplvei_w(_r1nnn, 0); + __m128 _r1d = (__m128)__lsx_vreplvei_w(_r1nnn, 1); + __m128 _r1e = (__m128)__lsx_vreplvei_w(_r1nnn, 2); + __m128 _r1f = (__m128)__lsx_vreplvei_w(_r1nnn, 3); + __m128 _r1g = __lsx_vreplfr2vr_s(r1[16]); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r16, _sum3); + _sum4 = __lsx_vfmadd_s(_k10, _r18, _sum4); + _sum5 = __lsx_vfmadd_s(_k10, _r1a, _sum5); + _sum6 = __lsx_vfmadd_s(_k10, _r1c, _sum6); + _sum7 = __lsx_vfmadd_s(_k10, _r1e, _sum7); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r15, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r17, _sum3); + _sum4 = __lsx_vfmadd_s(_k11, _r19, _sum4); + _sum5 = __lsx_vfmadd_s(_k11, _r1b, _sum5); + _sum6 = __lsx_vfmadd_s(_k11, _r1d, _sum6); + _sum7 = __lsx_vfmadd_s(_k11, _r1f, _sum7); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r16, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r18, _sum3); + _sum4 = __lsx_vfmadd_s(_k12, _r1a, _sum4); + _sum5 = __lsx_vfmadd_s(_k12, _r1c, _sum5); + _sum6 = __lsx_vfmadd_s(_k12, _r1e, _sum6); + _sum7 = __lsx_vfmadd_s(_k12, _r1g, _sum7); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + __m128i _r2nn = __lsx_vld(r2 + 8, 0); + __m128i _r2nnn = __lsx_vld(r2 + 12, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2); + __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3); + __m128 _r28 = (__m128)__lsx_vreplvei_w(_r2nn, 0); + __m128 _r29 = (__m128)__lsx_vreplvei_w(_r2nn, 1); + __m128 _r2a = (__m128)__lsx_vreplvei_w(_r2nn, 2); + __m128 _r2b = (__m128)__lsx_vreplvei_w(_r2nn, 3); + __m128 _r2c = (__m128)__lsx_vreplvei_w(_r2nnn, 0); + __m128 _r2d = (__m128)__lsx_vreplvei_w(_r2nnn, 1); + __m128 _r2e = (__m128)__lsx_vreplvei_w(_r2nnn, 2); + __m128 _r2f = (__m128)__lsx_vreplvei_w(_r2nnn, 3); + __m128 _r2g = __lsx_vreplfr2vr_s(r2[16]); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r26, _sum3); + _sum4 = __lsx_vfmadd_s(_k20, _r28, _sum4); + _sum5 = __lsx_vfmadd_s(_k20, _r2a, _sum5); + _sum6 = __lsx_vfmadd_s(_k20, _r2c, _sum6); + _sum7 = __lsx_vfmadd_s(_k20, _r2e, _sum7); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r25, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r27, _sum3); + _sum4 = 
__lsx_vfmadd_s(_k21, _r29, _sum4); + _sum5 = __lsx_vfmadd_s(_k21, _r2b, _sum5); + _sum6 = __lsx_vfmadd_s(_k21, _r2d, _sum6); + _sum7 = __lsx_vfmadd_s(_k21, _r2f, _sum7); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r26, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r28, _sum3); + _sum4 = __lsx_vfmadd_s(_k22, _r2a, _sum4); + _sum5 = __lsx_vfmadd_s(_k22, _r2c, _sum5); + _sum6 = __lsx_vfmadd_s(_k22, _r2e, _sum6); + _sum7 = __lsx_vfmadd_s(_k22, _r2g, _sum7); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + __lsx_vst(_sum4, outptr0 + 4 * 4, 0); + __lsx_vst(_sum5, outptr0 + 4 * 5, 0); + __lsx_vst(_sum6, outptr0 + 4 * 6, 0); + __lsx_vst(_sum7, outptr0 + 4 * 7, 0); + + outptr0 += 4 * 8; + + r0 += 16; + r1 += 16; + r2 += 16; + } + for (; j + 3 < outw; j += 4) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2); + __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3); + __m128 _r08 = __lsx_vreplfr2vr_s(r0[8]); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r06, _sum3); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r05, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r07, _sum3); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r06, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r08, _sum3); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2); + __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3); + __m128 _r18 = __lsx_vreplfr2vr_s(r1[8]); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r16, _sum3); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r15, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r17, _sum3); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r16, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r18, _sum3); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = 
(__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2); + __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3); + __m128 _r28 = __lsx_vreplfr2vr_s(r2[8]); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r26, _sum3); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r25, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r27, _sum3); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r26, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r28, _sum3); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + + outptr0 += 4 * 4; + + r0 += 8; + r1 += 8; + r2 += 8; + } + for (; j + 1 < outw; j += 2) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = __lsx_vreplfr2vr_s(r0[4]); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = __lsx_vreplfr2vr_s(r1[4]); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = __lsx_vreplfr2vr_s(r2[4]); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + + outptr0 += 4 * 2; + + r0 += 4; + r1 += 4; + r2 += 4; + } + for (; j < outw; j++) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + + _sum0 = 
__lsx_vfmadd_s(_k10, _r10, _sum0); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 2; + r1 += 2; + r2 += 2; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + + k0 += 9 * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_3x3_pack4.h b/src/layer/loongarch/convolution_3x3_pack4.h new file mode 100644 index 000000000000..f06bb7e9068c --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_pack4.h @@ -0,0 +1,425 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_winograd63_transform_kernel_pack4_lsx(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt) +{ + // winograd63 transform kernel + Mat kernel_tm; + kernel_tm.create(8 * 8, inch, outch); + + const float ktm[8][3] = { + {1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel, transposed + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[8][3]; + for (int i = 0; i < 8; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // v + for (int j = 0; j < 8; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) + { + kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 64-inch-outch + // dst = pb-pa-inch/pa-64-outch/pb + kernel_tm_pack4.create(inch / 4, 64, outch / 4, (size_t)4u * 4 * 4, 4 * 4); + + for (int q = 0; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm_pack4.channel(q / 4); + + for (int k = 0; k < 64; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = kernel_tm.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } +} + +static void 
conv3x3s1_winograd63_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 6n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 5) / 6 * 6; + outh = (outh + 5) / 6 * 6; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 6; + int h_tiles = outh / 6; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 64, inch, elemsize, elempack, opt.workspace_allocator); + conv3x3s1_winograd63_transform_input_pack4_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack4_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator); + } + { + conv3x3s1_winograd63_transform_output_pack4_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} + +static void conv3x3s1_winograd43_transform_kernel_pack4_lsx(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt) +{ + // winograd43 transform kernel + Mat kernel_tm(6 * 6, inch, outch); + + const float ktm[6][3] = { + {1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = pb-pa-inch/pa-36-outch/pb + kernel_tm_pack4.create(inch / 4, 36, outch / 4, (size_t)4u * 4 * 4, 4 * 4); + + for (int q = 0; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm_pack4.channel(q / 4); + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = kernel_tm.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } 
+} + +static void conv3x3s1_winograd43_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_pack4_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack4_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_pack4_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} + +static void conv3x3s1_winograd23_transform_kernel_pack4_lsx(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt) +{ + // winograd23 transform kernel + Mat kernel_tm(4 * 4, inch, outch); + + const float ktm[4][3] = { + {1.0f, 0.0f, 0.0f}, + {1.0f / 2, 1.0f / 2, 1.0f / 2}, + {1.0f / 2, -1.0f / 2, 1.0f / 2}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[4][3]; + for (int i = 0; i < 4; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 4; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 4; i++) + { + kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 16-inch-outch + // dst = pb-pa-inch/pa-16-outch/pb + kernel_tm_pack4.create(inch / 4, 16, outch / 4, (size_t)4u * 4 * 4, 4 * 4); + + for (int q = 0; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm_pack4.channel(q / 4); + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = kernel_tm.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } +} + +static void conv3x3s1_winograd23_pack4_lsx(const Mat& 
bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 2n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 1) / 2 * 2; + outh = (outh + 1) / 2 * 2; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 2; + int h_tiles = outh / 2; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 16, inch, elemsize, elempack, opt.workspace_allocator); + conv3x3s1_winograd23_transform_input_pack4_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack4_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator); + } + { + conv3x3s1_winograd23_transform_output_pack4_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_3x3_pack8to1_int8.h b/src/layer/loongarch/convolution_3x3_pack8to1_int8.h new file mode 100644 index 000000000000..3c4f97187533 --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_pack8to1_int8.h @@ -0,0 +1,177 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +static void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_lsx(const Mat& kernel, Mat& kernel_tm_pack8to1, int inch, int outch, const Option& opt) +{ + // winograd43 transform kernel + Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); + + const short ktm[6][3] = { + {6, 0, 0}, + {-4, -4, -4}, + {-4, 4, -4}, + {1, 2, 4}, + {1, -2, 4}, + {0, 0, 6} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; + short* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const signed char* k0 = kernel0; + const signed char* k1 = kernel0 + 3; + const signed char* k2 = kernel0 + 6; + + // h + short tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + short* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = 4b-8a-inch/8a-36-outch/4b + kernel_tm_pack8to1.create(8 * inch / 8, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4); + + int p = 0; + for (; p + 3 < outch; p += 4) + { + const Mat k0 = kernel_tm.channel(p); + const Mat k1 = kernel_tm.channel(p + 1); + const Mat k2 = kernel_tm.channel(p + 2); + const Mat k3 = kernel_tm.channel(p + 3); + + Mat g0 = kernel_tm_pack8to1.channel(p / 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + for (int q = 0; q + 7 < inch; q += 8) + { + for (int i = 0; i < 8; i++) + { + g00[0] = k0.row(q + i)[k]; + g00[1] = k1.row(q + i)[k]; + g00[2] = k2.row(q + i)[k]; + g00[3] = k3.row(q + i)[k]; + + g00 += 4; + } + } + } + } + for (; p < outch; p++) + { + const Mat k0 = kernel_tm.channel(p); + + Mat g0 = kernel_tm_pack8to1.channel(p / 4 + p % 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + for (int q = 0; q + 7 < inch; q += 8) + { + for (int i = 0; i < 8; i++) + { + g00[0] = k0.row(q + i)[k]; + + g00 += 1; + } + } + } + } +} + +static void conv3x3s1_winograd43_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + // size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_pack8_int8_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack8to1_int8_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat 
top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_int8_lsx(top_blob_tm, top_blob_bordered, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_3x3_pack8to4_int8.h b/src/layer/loongarch/convolution_3x3_pack8to4_int8.h new file mode 100644 index 000000000000..bf328cee73f9 --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_pack8to4_int8.h @@ -0,0 +1,161 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_lsx(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt) +{ + // winograd43 transform kernel + Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); + + const short ktm[6][3] = { + {6, 0, 0}, + {-4, -4, -4}, + {-4, 4, -4}, + {1, 2, 4}, + {1, -2, 4}, + {0, 0, 6} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; + short* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const signed char* k0 = kernel0; + const signed char* k1 = kernel0 + 3; + const signed char* k2 = kernel0 + 6; + + // h + short tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + short* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = 4b-8a-inch/8a-36-outch/4b + kernel_tm_pack8.create(inch / 8, 36, outch / 4, (size_t)2u * 32, 32); + + int q = 0; + for (; q + 3 < outch; q += 4) + { + const Mat k0 = kernel_tm.channel(q); + const Mat k1 = kernel_tm.channel(q + 1); + const Mat k2 = kernel_tm.channel(q + 2); + const Mat k3 = kernel_tm.channel(q + 3); + + Mat kernel_tm = kernel_tm_pack8.channel(q / 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = kernel_tm.row(k); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int i = 0; i < 8; i++) + { + const short* k00 = k0.row(p + i); + const short* k10 = k1.row(p + i); + const short* k20 = k2.row(p + i); + const short* k30 = k3.row(p + i); + + g00[0] = k00[k]; + g00[1] = k10[k]; + g00[2] = k20[k]; + g00[3] = k30[k]; + + g00 += 4; + } + } + } + } +} + +static void 
conv3x3s1_winograd43_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + // size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_pack8_int8_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack8to4_int8_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u * 4, 4, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_pack4_int8_lsx(top_blob_tm, top_blob_bordered, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_7x7_pack1to4.h b/src/layer/loongarch/convolution_7x7_pack1to4.h new file mode 100644 index 000000000000..f57923b53d00 --- /dev/null +++ b/src/layer/loongarch/convolution_7x7_pack1to4.h @@ -0,0 +1,652 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv7x7s2_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int tailstep = w - 2 * outw + w; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = bias ? 
(__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + out0.fill(_bias0); + + for (int q = 0; q < inch; q++) + { + float* outptr0 = out0; + + const Mat img0 = bottom_blob.channel(q); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + const float* r3 = img0.row(3); + const float* r4 = img0.row(4); + const float* r5 = img0.row(5); + const float* r6 = img0.row(6); + + const float* kptr = kernel.channel(p).row(q); + + int i = 0; + + for (; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + + __m128 _k00 = (__m128)__lsx_vld(kptr, 0); + __m128 _k01 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k05 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k06 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + __m128i _r0nn = __lsx_vld(r0 + 8, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2); + __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3); + __m128 _r08 = (__m128)__lsx_vreplvei_w(_r0nn, 0); + __m128 _r09 = (__m128)__lsx_vreplvei_w(_r0nn, 1); + __m128 _r0a = (__m128)__lsx_vreplvei_w(_r0nn, 2); + __m128 _r0b = (__m128)__lsx_vreplvei_w(_r0nn, 3); + __m128 _r0c = __lsx_vreplfr2vr_s(r0[12]); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r06, _sum3); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r05, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r07, _sum3); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r06, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r08, _sum3); + _sum0 = __lsx_vfmadd_s(_k03, _r03, _sum0); + _sum1 = __lsx_vfmadd_s(_k03, _r05, _sum1); + _sum2 = __lsx_vfmadd_s(_k03, _r07, _sum2); + _sum3 = __lsx_vfmadd_s(_k03, _r09, _sum3); + _sum0 = __lsx_vfmadd_s(_k04, _r04, _sum0); + _sum1 = __lsx_vfmadd_s(_k04, _r06, _sum1); + _sum2 = __lsx_vfmadd_s(_k04, _r08, _sum2); + _sum3 = __lsx_vfmadd_s(_k04, _r0a, _sum3); + _sum0 = __lsx_vfmadd_s(_k05, _r05, _sum0); + _sum1 = __lsx_vfmadd_s(_k05, _r07, _sum1); + _sum2 = __lsx_vfmadd_s(_k05, _r09, _sum2); + _sum3 = __lsx_vfmadd_s(_k05, _r0b, _sum3); + _sum0 = __lsx_vfmadd_s(_k06, _r06, _sum0); + _sum1 = __lsx_vfmadd_s(_k06, _r08, _sum1); + _sum2 = __lsx_vfmadd_s(_k06, _r0a, _sum2); + _sum3 = __lsx_vfmadd_s(_k06, _r0c, _sum3); + + __m128 _k10 = (__m128)__lsx_vld(kptr, 0); + __m128 _k11 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k14 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k15 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k16 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + 
kptr += 4 * 7; + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + __m128i _r1nn = __lsx_vld(r1 + 8, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2); + __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3); + __m128 _r18 = (__m128)__lsx_vreplvei_w(_r1nn, 0); + __m128 _r19 = (__m128)__lsx_vreplvei_w(_r1nn, 1); + __m128 _r1a = (__m128)__lsx_vreplvei_w(_r1nn, 2); + __m128 _r1b = (__m128)__lsx_vreplvei_w(_r1nn, 3); + __m128 _r1c = __lsx_vreplfr2vr_s(r1[12]); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r16, _sum3); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r15, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r17, _sum3); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r16, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r18, _sum3); + _sum0 = __lsx_vfmadd_s(_k13, _r13, _sum0); + _sum1 = __lsx_vfmadd_s(_k13, _r15, _sum1); + _sum2 = __lsx_vfmadd_s(_k13, _r17, _sum2); + _sum3 = __lsx_vfmadd_s(_k13, _r19, _sum3); + _sum0 = __lsx_vfmadd_s(_k14, _r14, _sum0); + _sum1 = __lsx_vfmadd_s(_k14, _r16, _sum1); + _sum2 = __lsx_vfmadd_s(_k14, _r18, _sum2); + _sum3 = __lsx_vfmadd_s(_k14, _r1a, _sum3); + _sum0 = __lsx_vfmadd_s(_k15, _r15, _sum0); + _sum1 = __lsx_vfmadd_s(_k15, _r17, _sum1); + _sum2 = __lsx_vfmadd_s(_k15, _r19, _sum2); + _sum3 = __lsx_vfmadd_s(_k15, _r1b, _sum3); + _sum0 = __lsx_vfmadd_s(_k16, _r16, _sum0); + _sum1 = __lsx_vfmadd_s(_k16, _r18, _sum1); + _sum2 = __lsx_vfmadd_s(_k16, _r1a, _sum2); + _sum3 = __lsx_vfmadd_s(_k16, _r1c, _sum3); + + __m128 _k20 = (__m128)__lsx_vld(kptr, 0); + __m128 _k21 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k25 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k26 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + __m128i _r2nn = __lsx_vld(r2 + 8, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2); + __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3); + __m128 _r28 = (__m128)__lsx_vreplvei_w(_r2nn, 0); + __m128 _r29 = (__m128)__lsx_vreplvei_w(_r2nn, 1); + __m128 _r2a = (__m128)__lsx_vreplvei_w(_r2nn, 2); + __m128 _r2b = (__m128)__lsx_vreplvei_w(_r2nn, 3); + __m128 _r2c = __lsx_vreplfr2vr_s(r2[12]); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r26, _sum3); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r25, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r27, 
_sum3); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r26, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r28, _sum3); + _sum0 = __lsx_vfmadd_s(_k23, _r23, _sum0); + _sum1 = __lsx_vfmadd_s(_k23, _r25, _sum1); + _sum2 = __lsx_vfmadd_s(_k23, _r27, _sum2); + _sum3 = __lsx_vfmadd_s(_k23, _r29, _sum3); + _sum0 = __lsx_vfmadd_s(_k24, _r24, _sum0); + _sum1 = __lsx_vfmadd_s(_k24, _r26, _sum1); + _sum2 = __lsx_vfmadd_s(_k24, _r28, _sum2); + _sum3 = __lsx_vfmadd_s(_k24, _r2a, _sum3); + _sum0 = __lsx_vfmadd_s(_k25, _r25, _sum0); + _sum1 = __lsx_vfmadd_s(_k25, _r27, _sum1); + _sum2 = __lsx_vfmadd_s(_k25, _r29, _sum2); + _sum3 = __lsx_vfmadd_s(_k25, _r2b, _sum3); + _sum0 = __lsx_vfmadd_s(_k26, _r26, _sum0); + _sum1 = __lsx_vfmadd_s(_k26, _r28, _sum1); + _sum2 = __lsx_vfmadd_s(_k26, _r2a, _sum2); + _sum3 = __lsx_vfmadd_s(_k26, _r2c, _sum3); + + __m128 _k30 = (__m128)__lsx_vld(kptr, 0); + __m128 _k31 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k33 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k35 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k36 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r3 = __lsx_vld(r3, 0); + __m128i _r3n = __lsx_vld(r3 + 4, 0); + __m128i _r3nn = __lsx_vld(r3 + 8, 0); + + __m128 _r30 = (__m128)__lsx_vreplvei_w(_r3, 0); + __m128 _r31 = (__m128)__lsx_vreplvei_w(_r3, 1); + __m128 _r32 = (__m128)__lsx_vreplvei_w(_r3, 2); + __m128 _r33 = (__m128)__lsx_vreplvei_w(_r3, 3); + __m128 _r34 = (__m128)__lsx_vreplvei_w(_r3n, 0); + __m128 _r35 = (__m128)__lsx_vreplvei_w(_r3n, 1); + __m128 _r36 = (__m128)__lsx_vreplvei_w(_r3n, 2); + __m128 _r37 = (__m128)__lsx_vreplvei_w(_r3n, 3); + __m128 _r38 = (__m128)__lsx_vreplvei_w(_r3nn, 0); + __m128 _r39 = (__m128)__lsx_vreplvei_w(_r3nn, 1); + __m128 _r3a = (__m128)__lsx_vreplvei_w(_r3nn, 2); + __m128 _r3b = (__m128)__lsx_vreplvei_w(_r3nn, 3); + __m128 _r3c = __lsx_vreplfr2vr_s(r3[12]); + + _sum0 = __lsx_vfmadd_s(_k30, _r30, _sum0); + _sum1 = __lsx_vfmadd_s(_k30, _r32, _sum1); + _sum2 = __lsx_vfmadd_s(_k30, _r34, _sum2); + _sum3 = __lsx_vfmadd_s(_k30, _r36, _sum3); + _sum0 = __lsx_vfmadd_s(_k31, _r31, _sum0); + _sum1 = __lsx_vfmadd_s(_k31, _r33, _sum1); + _sum2 = __lsx_vfmadd_s(_k31, _r35, _sum2); + _sum3 = __lsx_vfmadd_s(_k31, _r37, _sum3); + _sum0 = __lsx_vfmadd_s(_k32, _r32, _sum0); + _sum1 = __lsx_vfmadd_s(_k32, _r34, _sum1); + _sum2 = __lsx_vfmadd_s(_k32, _r36, _sum2); + _sum3 = __lsx_vfmadd_s(_k32, _r38, _sum3); + _sum0 = __lsx_vfmadd_s(_k33, _r33, _sum0); + _sum1 = __lsx_vfmadd_s(_k33, _r35, _sum1); + _sum2 = __lsx_vfmadd_s(_k33, _r37, _sum2); + _sum3 = __lsx_vfmadd_s(_k33, _r39, _sum3); + _sum0 = __lsx_vfmadd_s(_k34, _r34, _sum0); + _sum1 = __lsx_vfmadd_s(_k34, _r36, _sum1); + _sum2 = __lsx_vfmadd_s(_k34, _r38, _sum2); + _sum3 = __lsx_vfmadd_s(_k34, _r3a, _sum3); + _sum0 = __lsx_vfmadd_s(_k35, _r35, _sum0); + _sum1 = __lsx_vfmadd_s(_k35, _r37, _sum1); + _sum2 = __lsx_vfmadd_s(_k35, _r39, _sum2); + _sum3 = __lsx_vfmadd_s(_k35, _r3b, _sum3); + _sum0 = __lsx_vfmadd_s(_k36, _r36, _sum0); + _sum1 = __lsx_vfmadd_s(_k36, _r38, _sum1); + _sum2 = __lsx_vfmadd_s(_k36, _r3a, _sum2); + _sum3 = __lsx_vfmadd_s(_k36, _r3c, _sum3); + + __m128 _k40 = (__m128)__lsx_vld(kptr, 0); + __m128 _k41 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(kptr + 
4 * 4, 0); + __m128 _k45 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k46 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r4 = __lsx_vld(r4, 0); + __m128i _r4n = __lsx_vld(r4 + 4, 0); + __m128i _r4nn = __lsx_vld(r4 + 8, 0); + + __m128 _r40 = (__m128)__lsx_vreplvei_w(_r4, 0); + __m128 _r41 = (__m128)__lsx_vreplvei_w(_r4, 1); + __m128 _r42 = (__m128)__lsx_vreplvei_w(_r4, 2); + __m128 _r43 = (__m128)__lsx_vreplvei_w(_r4, 3); + __m128 _r44 = (__m128)__lsx_vreplvei_w(_r4n, 0); + __m128 _r45 = (__m128)__lsx_vreplvei_w(_r4n, 1); + __m128 _r46 = (__m128)__lsx_vreplvei_w(_r4n, 2); + __m128 _r47 = (__m128)__lsx_vreplvei_w(_r4n, 3); + __m128 _r48 = (__m128)__lsx_vreplvei_w(_r4nn, 0); + __m128 _r49 = (__m128)__lsx_vreplvei_w(_r4nn, 1); + __m128 _r4a = (__m128)__lsx_vreplvei_w(_r4nn, 2); + __m128 _r4b = (__m128)__lsx_vreplvei_w(_r4nn, 3); + __m128 _r4c = __lsx_vreplfr2vr_s(r4[12]); + + _sum0 = __lsx_vfmadd_s(_k40, _r40, _sum0); + _sum1 = __lsx_vfmadd_s(_k40, _r42, _sum1); + _sum2 = __lsx_vfmadd_s(_k40, _r44, _sum2); + _sum3 = __lsx_vfmadd_s(_k40, _r46, _sum3); + _sum0 = __lsx_vfmadd_s(_k41, _r41, _sum0); + _sum1 = __lsx_vfmadd_s(_k41, _r43, _sum1); + _sum2 = __lsx_vfmadd_s(_k41, _r45, _sum2); + _sum3 = __lsx_vfmadd_s(_k41, _r47, _sum3); + _sum0 = __lsx_vfmadd_s(_k42, _r42, _sum0); + _sum1 = __lsx_vfmadd_s(_k42, _r44, _sum1); + _sum2 = __lsx_vfmadd_s(_k42, _r46, _sum2); + _sum3 = __lsx_vfmadd_s(_k42, _r48, _sum3); + _sum0 = __lsx_vfmadd_s(_k43, _r43, _sum0); + _sum1 = __lsx_vfmadd_s(_k43, _r45, _sum1); + _sum2 = __lsx_vfmadd_s(_k43, _r47, _sum2); + _sum3 = __lsx_vfmadd_s(_k43, _r49, _sum3); + _sum0 = __lsx_vfmadd_s(_k44, _r44, _sum0); + _sum1 = __lsx_vfmadd_s(_k44, _r46, _sum1); + _sum2 = __lsx_vfmadd_s(_k44, _r48, _sum2); + _sum3 = __lsx_vfmadd_s(_k44, _r4a, _sum3); + _sum0 = __lsx_vfmadd_s(_k45, _r45, _sum0); + _sum1 = __lsx_vfmadd_s(_k45, _r47, _sum1); + _sum2 = __lsx_vfmadd_s(_k45, _r49, _sum2); + _sum3 = __lsx_vfmadd_s(_k45, _r4b, _sum3); + _sum0 = __lsx_vfmadd_s(_k46, _r46, _sum0); + _sum1 = __lsx_vfmadd_s(_k46, _r48, _sum1); + _sum2 = __lsx_vfmadd_s(_k46, _r4a, _sum2); + _sum3 = __lsx_vfmadd_s(_k46, _r4c, _sum3); + + __m128 _k50 = (__m128)__lsx_vld(kptr, 0); + __m128 _k51 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k52 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k53 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k54 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k55 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k56 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r5 = __lsx_vld(r5, 0); + __m128i _r5n = __lsx_vld(r5 + 4, 0); + __m128i _r5nn = __lsx_vld(r5 + 8, 0); + + __m128 _r50 = (__m128)__lsx_vreplvei_w(_r5, 0); + __m128 _r51 = (__m128)__lsx_vreplvei_w(_r5, 1); + __m128 _r52 = (__m128)__lsx_vreplvei_w(_r5, 2); + __m128 _r53 = (__m128)__lsx_vreplvei_w(_r5, 3); + __m128 _r54 = (__m128)__lsx_vreplvei_w(_r5n, 0); + __m128 _r55 = (__m128)__lsx_vreplvei_w(_r5n, 1); + __m128 _r56 = (__m128)__lsx_vreplvei_w(_r5n, 2); + __m128 _r57 = (__m128)__lsx_vreplvei_w(_r5n, 3); + __m128 _r58 = (__m128)__lsx_vreplvei_w(_r5nn, 0); + __m128 _r59 = (__m128)__lsx_vreplvei_w(_r5nn, 1); + __m128 _r5a = (__m128)__lsx_vreplvei_w(_r5nn, 2); + __m128 _r5b = (__m128)__lsx_vreplvei_w(_r5nn, 3); + __m128 _r5c = __lsx_vreplfr2vr_s(r5[12]); + + _sum0 = __lsx_vfmadd_s(_k50, _r50, _sum0); + _sum1 = __lsx_vfmadd_s(_k50, _r52, _sum1); + _sum2 = __lsx_vfmadd_s(_k50, _r54, _sum2); + _sum3 = __lsx_vfmadd_s(_k50, _r56, _sum3); + _sum0 = __lsx_vfmadd_s(_k51, _r51, _sum0); + _sum1 = 
__lsx_vfmadd_s(_k51, _r53, _sum1); + _sum2 = __lsx_vfmadd_s(_k51, _r55, _sum2); + _sum3 = __lsx_vfmadd_s(_k51, _r57, _sum3); + _sum0 = __lsx_vfmadd_s(_k52, _r52, _sum0); + _sum1 = __lsx_vfmadd_s(_k52, _r54, _sum1); + _sum2 = __lsx_vfmadd_s(_k52, _r56, _sum2); + _sum3 = __lsx_vfmadd_s(_k52, _r58, _sum3); + _sum0 = __lsx_vfmadd_s(_k53, _r53, _sum0); + _sum1 = __lsx_vfmadd_s(_k53, _r55, _sum1); + _sum2 = __lsx_vfmadd_s(_k53, _r57, _sum2); + _sum3 = __lsx_vfmadd_s(_k53, _r59, _sum3); + _sum0 = __lsx_vfmadd_s(_k54, _r54, _sum0); + _sum1 = __lsx_vfmadd_s(_k54, _r56, _sum1); + _sum2 = __lsx_vfmadd_s(_k54, _r58, _sum2); + _sum3 = __lsx_vfmadd_s(_k54, _r5a, _sum3); + _sum0 = __lsx_vfmadd_s(_k55, _r55, _sum0); + _sum1 = __lsx_vfmadd_s(_k55, _r57, _sum1); + _sum2 = __lsx_vfmadd_s(_k55, _r59, _sum2); + _sum3 = __lsx_vfmadd_s(_k55, _r5b, _sum3); + _sum0 = __lsx_vfmadd_s(_k56, _r56, _sum0); + _sum1 = __lsx_vfmadd_s(_k56, _r58, _sum1); + _sum2 = __lsx_vfmadd_s(_k56, _r5a, _sum2); + _sum3 = __lsx_vfmadd_s(_k56, _r5c, _sum3); + + __m128 _k60 = (__m128)__lsx_vld(kptr, 0); + __m128 _k61 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k62 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k63 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k64 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k65 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k66 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr -= 4 * 42; + + __m128i _r6 = __lsx_vld(r6, 0); + __m128i _r6n = __lsx_vld(r6 + 4, 0); + __m128i _r6nn = __lsx_vld(r6 + 8, 0); + + __m128 _r60 = (__m128)__lsx_vreplvei_w(_r6, 0); + __m128 _r61 = (__m128)__lsx_vreplvei_w(_r6, 1); + __m128 _r62 = (__m128)__lsx_vreplvei_w(_r6, 2); + __m128 _r63 = (__m128)__lsx_vreplvei_w(_r6, 3); + __m128 _r64 = (__m128)__lsx_vreplvei_w(_r6n, 0); + __m128 _r65 = (__m128)__lsx_vreplvei_w(_r6n, 1); + __m128 _r66 = (__m128)__lsx_vreplvei_w(_r6n, 2); + __m128 _r67 = (__m128)__lsx_vreplvei_w(_r6n, 3); + __m128 _r68 = (__m128)__lsx_vreplvei_w(_r6nn, 0); + __m128 _r69 = (__m128)__lsx_vreplvei_w(_r6nn, 1); + __m128 _r6a = (__m128)__lsx_vreplvei_w(_r6nn, 2); + __m128 _r6b = (__m128)__lsx_vreplvei_w(_r6nn, 3); + __m128 _r6c = __lsx_vreplfr2vr_s(r6[12]); + + _sum0 = __lsx_vfmadd_s(_k60, _r60, _sum0); + _sum1 = __lsx_vfmadd_s(_k60, _r62, _sum1); + _sum2 = __lsx_vfmadd_s(_k60, _r64, _sum2); + _sum3 = __lsx_vfmadd_s(_k60, _r66, _sum3); + _sum0 = __lsx_vfmadd_s(_k61, _r61, _sum0); + _sum1 = __lsx_vfmadd_s(_k61, _r63, _sum1); + _sum2 = __lsx_vfmadd_s(_k61, _r65, _sum2); + _sum3 = __lsx_vfmadd_s(_k61, _r67, _sum3); + _sum0 = __lsx_vfmadd_s(_k62, _r62, _sum0); + _sum1 = __lsx_vfmadd_s(_k62, _r64, _sum1); + _sum2 = __lsx_vfmadd_s(_k62, _r66, _sum2); + _sum3 = __lsx_vfmadd_s(_k62, _r68, _sum3); + _sum0 = __lsx_vfmadd_s(_k63, _r63, _sum0); + _sum1 = __lsx_vfmadd_s(_k63, _r65, _sum1); + _sum2 = __lsx_vfmadd_s(_k63, _r67, _sum2); + _sum3 = __lsx_vfmadd_s(_k63, _r69, _sum3); + _sum0 = __lsx_vfmadd_s(_k64, _r64, _sum0); + _sum1 = __lsx_vfmadd_s(_k64, _r66, _sum1); + _sum2 = __lsx_vfmadd_s(_k64, _r68, _sum2); + _sum3 = __lsx_vfmadd_s(_k64, _r6a, _sum3); + _sum0 = __lsx_vfmadd_s(_k65, _r65, _sum0); + _sum1 = __lsx_vfmadd_s(_k65, _r67, _sum1); + _sum2 = __lsx_vfmadd_s(_k65, _r69, _sum2); + _sum3 = __lsx_vfmadd_s(_k65, _r6b, _sum3); + _sum0 = __lsx_vfmadd_s(_k66, _r66, _sum0); + _sum1 = __lsx_vfmadd_s(_k66, _r68, _sum1); + _sum2 = __lsx_vfmadd_s(_k66, _r6a, _sum2); + _sum3 = __lsx_vfmadd_s(_k66, _r6c, _sum3); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + 
__lsx_vst(_sum3, outptr0 + 4 * 3, 0); + + outptr0 += 4 * 4; + + r0 += 8; + r1 += 8; + r2 += 8; + r3 += 8; + r4 += 8; + r5 += 8; + r6 += 8; + } + for (; j < outw; j++) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + + __m128 _k00 = (__m128)__lsx_vld(kptr, 0); + __m128 _k01 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k05 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k06 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k00, (__m128)__lsx_vreplvei_w(_r0, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k01, (__m128)__lsx_vreplvei_w(_r0, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k02, (__m128)__lsx_vreplvei_w(_r0, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k03, (__m128)__lsx_vreplvei_w(_r0, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k04, (__m128)__lsx_vreplvei_w(_r0n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k05, (__m128)__lsx_vreplvei_w(_r0n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k06, (__m128)__lsx_vreplvei_w(_r0n, 2), _sum0); + + __m128 _k10 = (__m128)__lsx_vld(kptr, 0); + __m128 _k11 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k14 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k15 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k16 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k10, (__m128)__lsx_vreplvei_w(_r1, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k11, (__m128)__lsx_vreplvei_w(_r1, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k12, (__m128)__lsx_vreplvei_w(_r1, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k13, (__m128)__lsx_vreplvei_w(_r1, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k14, (__m128)__lsx_vreplvei_w(_r1n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k15, (__m128)__lsx_vreplvei_w(_r1n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k16, (__m128)__lsx_vreplvei_w(_r1n, 2), _sum0); + + __m128 _k20 = (__m128)__lsx_vld(kptr, 0); + __m128 _k21 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k25 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k26 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k20, (__m128)__lsx_vreplvei_w(_r2, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k21, (__m128)__lsx_vreplvei_w(_r2, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k22, (__m128)__lsx_vreplvei_w(_r2, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k23, (__m128)__lsx_vreplvei_w(_r2, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k24, (__m128)__lsx_vreplvei_w(_r2n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k25, (__m128)__lsx_vreplvei_w(_r2n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k26, (__m128)__lsx_vreplvei_w(_r2n, 2), _sum0); + + __m128 _k30 = (__m128)__lsx_vld(kptr, 0); + __m128 _k31 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k33 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k35 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k36 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r3 = __lsx_vld(r3, 0); + __m128i _r3n = __lsx_vld(r3 + 4, 
0); + + _sum0 = __lsx_vfmadd_s(_k30, (__m128)__lsx_vreplvei_w(_r3, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k31, (__m128)__lsx_vreplvei_w(_r3, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k32, (__m128)__lsx_vreplvei_w(_r3, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k33, (__m128)__lsx_vreplvei_w(_r3, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k34, (__m128)__lsx_vreplvei_w(_r3n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k35, (__m128)__lsx_vreplvei_w(_r3n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k36, (__m128)__lsx_vreplvei_w(_r3n, 2), _sum0); + + __m128 _k40 = (__m128)__lsx_vld(kptr, 0); + __m128 _k41 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k45 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k46 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r4 = __lsx_vld(r4, 0); + __m128i _r4n = __lsx_vld(r4 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k40, (__m128)__lsx_vreplvei_w(_r4, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k41, (__m128)__lsx_vreplvei_w(_r4, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k42, (__m128)__lsx_vreplvei_w(_r4, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k43, (__m128)__lsx_vreplvei_w(_r4, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k44, (__m128)__lsx_vreplvei_w(_r4n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k45, (__m128)__lsx_vreplvei_w(_r4n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k46, (__m128)__lsx_vreplvei_w(_r4n, 2), _sum0); + + __m128 _k50 = (__m128)__lsx_vld(kptr, 0); + __m128 _k51 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k52 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k53 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k54 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k55 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k56 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r5 = __lsx_vld(r5, 0); + __m128i _r5n = __lsx_vld(r5 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k50, (__m128)__lsx_vreplvei_w(_r5, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k51, (__m128)__lsx_vreplvei_w(_r5, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k52, (__m128)__lsx_vreplvei_w(_r5, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k53, (__m128)__lsx_vreplvei_w(_r5, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k54, (__m128)__lsx_vreplvei_w(_r5n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k55, (__m128)__lsx_vreplvei_w(_r5n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k56, (__m128)__lsx_vreplvei_w(_r5n, 2), _sum0); + + __m128 _k60 = (__m128)__lsx_vld(kptr, 0); + __m128 _k61 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k62 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k63 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k64 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k65 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k66 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr -= 4 * 42; + + __m128i _r6 = __lsx_vld(r6, 0); + __m128i _r6n = __lsx_vld(r6 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k60, (__m128)__lsx_vreplvei_w(_r6, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k61, (__m128)__lsx_vreplvei_w(_r6, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k62, (__m128)__lsx_vreplvei_w(_r6, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k63, (__m128)__lsx_vreplvei_w(_r6, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k64, (__m128)__lsx_vreplvei_w(_r6n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k65, (__m128)__lsx_vreplvei_w(_r6n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k66, (__m128)__lsx_vreplvei_w(_r6n, 2), _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 2; + r1 += 2; + r2 += 2; + r3 += 2; + r4 += 2; + r5 += 2; + r6 += 2; + } + + r0 
+= tailstep; + r1 += tailstep; + r2 += tailstep; + r3 += tailstep; + r4 += tailstep; + r5 += tailstep; + r6 += tailstep; + } + } + } +} diff --git a/src/layer/loongarch/convolution_int8.h b/src/layer/loongarch/convolution_int8.h new file mode 100644 index 000000000000..22c7a8ccbe6b --- /dev/null +++ b/src/layer/loongarch/convolution_int8.h @@ -0,0 +1,82 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int sum = 0; + + // const signed char* kptr = weight_data_int8.channel(p); + const signed char* kptr = (const signed char*)weight_data_int8 + maxk * channels * p; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + signed char val = sptr[space_ofs[k]]; + signed char w = kptr[k]; + sum += val * w; + } + + kptr += maxk; + } + + outptr[j] = sum; + } + + outptr += outw; + } + } +} diff --git a/src/layer/loongarch/convolution_loongarch.cpp b/src/layer/loongarch/convolution_loongarch.cpp new file mode 100644 index 000000000000..31719b3de92b --- /dev/null +++ b/src/layer/loongarch/convolution_loongarch.cpp @@ -0,0 +1,975 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
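
The scalar fallback paths in this patch (convolution_int8 just above, the plain-float fallback later in convolution_loongarch.cpp, and the packed kernels further down) all rely on the same space_ofs table: per-tap offsets into the padded input are precomputed once so the inner loop can gather dilated samples with a flat index. Below is a minimal standalone sketch of that bookkeeping, not part of the patch; the sizes are made-up example values.

// Illustrative sketch: how the space_ofs table maps a flat tap index k to an
// offset inside the padded input, for arbitrary kernel size and dilation.
#include <cstdio>
#include <vector>

int main()
{
    const int w = 16;                        // padded input width (assumed)
    const int kernel_w = 3, kernel_h = 3;
    const int dilation_w = 2, dilation_h = 2;

    const int maxk = kernel_w * kernel_h;
    std::vector<int> space_ofs(maxk);
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;          // equals i * w * dilation_h + j * dilation_w
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    for (int k = 0; k < maxk; k++)
        printf("tap %d -> offset %d\n", k, space_ofs[k]);

    return 0;
}

The packed variants reuse exactly the same table and only scale the offset by the element pack size (space_ofs[k] * 4 or * 8).
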
+
+#include "convolution_loongarch.h"
+
+#include "benchmark.h"
+#include "cpu.h"
+#include "layer_type.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif // __loongarch_sx
+
+#include "loongarch_activation.h"
+#include "loongarch_usability.h"
+
+#include "cpu.h"
+
+namespace ncnn {
+
+#include "convolution_sgemm.h"
+#include "convolution_winograd_transform.h"
+#include "convolution_winograd_dot.h"
+#include "convolution_1x1.h"
+#include "convolution_3x3.h"
+
+#if NCNN_INT8
+#include "convolution_sgemm_int8.h"
+#include "convolution_winograd_transform_int8.h"
+#include "convolution_winograd_dot_int8.h"
+#include "convolution_1x1_int8.h"
+#include "convolution_3x3_int8.h"
+#include "convolution_int8.h"
+#endif // NCNN_INT8
+
+#if __loongarch_sx
+#include "convolution_pack4.h"
+#include "convolution_pack1to4.h"
+#include "convolution_pack4to1.h"
+
+#include "convolution_sgemm_pack4.h"
+#include "convolution_sgemm_pack4to1.h"
+#include "convolution_winograd_transform_pack4.h"
+#include "convolution_winograd_dot_pack4.h"
+#include "convolution_1x1_pack4.h"
+#include "convolution_1x1_pack4to1.h"
+#include "convolution_3x3_pack4.h"
+#include "convolution_3x3_pack1to4.h"
+#include "convolution_7x7_pack1to4.h"
+
+#if NCNN_INT8
+#include "convolution_pack8to4_int8.h"
+#include "convolution_pack1to4_int8.h"
+#include "convolution_pack8to1_int8.h"
+#include "convolution_sgemm_pack8to4_int8.h"
+#include "convolution_sgemm_pack1to4_int8.h"
+#include "convolution_sgemm_pack8to1_int8.h"
+#include "convolution_winograd_transform_pack4_int8.h"
+#include "convolution_winograd_transform_pack8_int8.h"
+#include "convolution_winograd_dot_pack8to4_int8.h"
+#include "convolution_winograd_dot_pack8to1_int8.h"
+#include "convolution_1x1_pack8to4_int8.h"
+#include "convolution_1x1_pack1to4_int8.h"
+#include "convolution_1x1_pack8to1_int8.h"
+#include "convolution_3x3_pack8to4_int8.h"
+#include "convolution_3x3_pack8to1_int8.h"
+#endif // NCNN_INT8
+#endif // __loongarch_sx
+
+Convolution_loongarch::Convolution_loongarch()
+{
+#if __loongarch_sx
+    support_packing = true;
+#endif // __loongarch_sx
+
+    activation = 0;
+}
+
+static void convolution_transform_kernel_packed_lsx(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
+{
+    const int maxk = kernel_w * kernel_h;
+
+    // src = kw-kh-inch-outch
+    // dst = pb-pa-kw-kh-inch/pa-outch/pb
+    {
+        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
+
+        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);
+
+        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
+        {
+            float* g00 = weight_data_tm.channel(q / out_elempack);
+
+            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
+            {
+                for (int k = 0; k < maxk; k++)
+                {
+                    for (int i = 0; i < elempack; i++)
+                    {
+                        for (int j = 0; j < out_elempack; j++)
+                        {
+                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);
+
+                            g00[0] = k00[k];
+
+                            g00++;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+int Convolution_loongarch::create_pipeline(const Option& opt)
+{
+    if (dynamic_weight)
+        return 0;
+
+    activation = create_activation_layer(activation_type, activation_params, opt);
+
+#if NCNN_INT8
+    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
+    {
+        return create_pipeline_int8_loongarch(opt);
+    }
+#endif
+
+    const int maxk = kernel_w * kernel_h;
+    const int num_input = weight_data_size / maxk / num_output;
+
+    int elempack
= 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif + +#if __loongarch_sx + // pack4 + if (elempack == 4 && out_elempack == 4) + { + if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd63_convolution && num_input >= 8 && num_output >= 8 && num_input <= 64 && num_output <= 64) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd63_transform_kernel_pack4_lsx(weight_data, weight_winograd63_data, num_input, num_output, opt); + else if ((opt.use_winograd43_convolution && num_input >= 8 && num_output >= 8) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd43_transform_kernel_pack4_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + else // if (opt.use_winograd23_convolution) + conv3x3s1_winograd23_transform_kernel_pack4_lsx(weight_data, weight_winograd23_data, num_input, num_output, opt); + } + else + { + convolution_transform_kernel_packed_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack1ton + if (elempack == 1 && out_elempack == 4) + { + convolution_transform_kernel_packed_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + + // pack4to1 + if (elempack == 4 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif // __loongarch_sx + + // pack1 + if (elempack == 1 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || !opt.use_winograd23_convolution) + { + conv3x3s1_winograd43_transform_kernel_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + } + else if (opt.use_winograd23_convolution) + { + conv3x3s1_winograd23_transform_kernel_lsx(weight_data, weight_winograd23_data, num_input, num_output, opt); + } + } + else if (opt.use_sgemm_convolution) + { + 
convolution_im2col_sgemm_transform_kernel_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + weight_data_tm = weight_data; + } + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int Convolution_loongarch::destroy_pipeline(const Option& opt) +{ + if (activation) + { + activation->destroy_pipeline(opt); + delete activation; + activation = 0; + } + + return 0; +} + +int Convolution_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + return forward_int8_loongarch(bottom_blob, top_blob, opt); + } +#endif + + // flattened blob, implement as InnerProduct + if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) + { + Mat bottom_blob_3d; + if (bottom_blob.elemsize % 16 == 0) + { + bottom_blob_3d = bottom_blob; + bottom_blob_3d.dims = 3; + bottom_blob_3d.w = 1; + bottom_blob_3d.h = 1; + bottom_blob_3d.c = bottom_blob.w; + bottom_blob_3d.cstep = 1; + } + else + { + bottom_blob_3d = bottom_blob.reshape(1, 1, bottom_blob.w, opt.workspace_allocator); + } + + Mat top_blob_3d; + int ret = forward(bottom_blob_3d, top_blob_3d, opt); + if (ret != 0) + return ret; + + if (top_blob_3d.elemsize % 16 == 0) + { + top_blob = top_blob_3d; + top_blob.dims = 1; + top_blob.w = top_blob_3d.c; + top_blob.h = 1; + top_blob.c = 1; + bottom_blob_3d.cstep = top_blob_3d.c; + } + else + { + top_blob = top_blob_3d.reshape(top_blob_3d.c, opt.blob_allocator); + } + + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + // NCNN_LOGE("Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int num_input = channels * elempack; + +#if __loongarch_sx + if (elempack == 4 && out_elempack == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd63_convolution && num_input >= 8 && num_output >= 8 && num_input <= 64 && num_output <= 64) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd63_pack4_lsx(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, opt); + else if ((opt.use_winograd43_convolution && num_input >= 8 && num_output >= 8) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd43_pack4_lsx(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt); + else // if (opt.use_winograd23_convolution) + conv3x3s1_winograd23_pack4_lsx(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + convolution_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 4) + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv3x3s2_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv7x7s2_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + convolution_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, 
opt); + } + } + + if (elempack == 4 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + convolution_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } +#endif // __loongarch_sx + + if (elempack == 1 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || !opt.use_winograd23_convolution) + { + conv3x3s1_winograd43_lsx(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt); + } + else if (opt.use_winograd23_convolution) + { + conv3x3s1_winograd23_lsx(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt); + } + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + const float* kptr = (const float*)weight_data_tm + maxk * channels * p; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + const float* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + float val = sptr[space_ofs[k]]; + float wt = kptr[k]; + sum += val * wt; + } + + kptr += maxk; + } + + sum = activation_ss(sum, 
activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } + } + } + + return 0; +} + +int Convolution_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& _weight_data = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + const int _kernel_w = _weight_data.w; + const int _kernel_h = _weight_data.h; + const int _num_output = _weight_data.c * _weight_data.elempack; + + Mat weight_data_flattened; + flatten(_weight_data, weight_data_flattened, opt); + if (weight_data_flattened.empty()) + return -100; + + // weight_data_flattened as pack1 + weight_data_flattened.w *= weight_data_flattened.elempack; + weight_data_flattened.elemsize /= weight_data_flattened.elempack; + weight_data_flattened.elempack = 1; + + Mat bias_data_flattened; + if (bias_term) + { + const Mat& _bias_data = bottom_blobs[2]; + flatten(_bias_data, bias_data_flattened, opt); + if (bias_data_flattened.empty()) + return -100; + + // bias_data_flattened as pack1 + bias_data_flattened.w *= bias_data_flattened.elempack; + bias_data_flattened.elemsize /= bias_data_flattened.elempack; + bias_data_flattened.elempack = 1; + } + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); + + ncnn::ParamDict pd; + pd.set(0, _num_output); + pd.set(1, _kernel_w); + pd.set(11, _kernel_h); + pd.set(2, dilation_w); + pd.set(21, dilation_h); + pd.set(3, stride_w); + pd.set(31, stride_h); + pd.set(4, pad_left); + pd.set(15, pad_right); + pd.set(14, pad_top); + pd.set(16, pad_bottom); + pd.set(18, pad_value); + pd.set(5, bias_term); + pd.set(6, weight_data_flattened.w); + pd.set(8, int8_scale_term); + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + ncnn::Mat weights[2]; + weights[0] = weight_data_flattened; + weights[1] = bias_data_flattened; + + op->load_model(ncnn::ModelBinFromMatArray(weights)); + + op->create_pipeline(opt); + + op->forward(bottom_blob, top_blob, opt); + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + +#if NCNN_INT8 +static void convolution_transform_kernel_packed_int8_lsx(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) +{ + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pa-pb-kw-kh-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + signed char* g00 = weight_data_tm.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < out_elempack; i++) + { + for (int j = 0; j < elempack; j++) + { + const signed char* k00 = weight_data_r2.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + } +} + +int Convolution_loongarch::create_pipeline_int8_loongarch(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + + int elempack = 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = num_input % 8 == 0 ? 8 : 1; + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif // __loongarch_sx + +#if __loongarch_sx + if (elempack == 8 && out_elempack == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_transform_kernel_pack8to4_int8_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_int8_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 1 && out_elempack == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_int8_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 8 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_transform_kernel_pack8to1_int8_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_int8_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif 
// __loongarch_sx + + if (elempack == 1 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_transform_kernel_int8_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + weight_data_tm = weight_data; + } + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // requantize and relu + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int Convolution_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + int w = bottom_blob_bordered.w; + int h = bottom_blob_bordered.h; + int channels = bottom_blob_bordered.c; + int elempack = bottom_blob_bordered.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + if (use_int8_requantize) + out_elempack = num_output % 8 == 0 ? 8 : 1; + else + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __loongarch_sx + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int num_input = channels * elempack; + + int out_elempack_int32 = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack_int32 = num_output % 4 == 0 ? 
4 : 1; + } +#endif // __loongarch_sx + + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 8 && out_elempack_int32 == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + } + + if (elempack == 1 && out_elempack_int32 == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + } + + if (elempack == 8 && out_elempack_int32 == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_pack8to1_int8_lsx(bottom_blob_bordered, 
top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + } +#endif // __loongarch_sx + + if (elempack == 1 && out_elempack_int32 == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + } + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + + return 0; +} +#endif // NCNN_INT8 + +} // namespace ncnn diff --git a/src/layer/loongarch/convolution_loongarch.h b/src/layer/loongarch/convolution_loongarch.h new file mode 100644 index 000000000000..a84281bf7135 --- /dev/null +++ b/src/layer/loongarch/convolution_loongarch.h @@ -0,0 +1,56 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
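
forward_int8_loongarch above finishes in one of two ways: when int8_scale_term > 100 the int32 sums are requantized straight back to int8 (with the activation fused into the requantize), otherwise they are dequantized to float using scale_in_data and the bias, and the activation layer runs afterwards. The sketch below models the dequantize branch in scalar form; the scale and bias values are made up for illustration and are not taken from the patch.

// Illustrative scalar sketch of the int8 epilogue: the int32 accumulator is scaled
// back to float with scale_in = 1 / (bottom_scale * weight_scale), then bias is added.
#include <cstdio>

int main()
{
    const int sum_int32 = 12345;              // accumulator produced by an int8 kernel
    const float bottom_scale = 127.f / 2.5f;  // assumed activation quantization scale
    const float weight_scale = 127.f / 0.8f;  // assumed per-output-channel weight scale
    const float bias = 0.01f;

    // mirrors the zero check in create_pipeline_int8_loongarch
    const float scale_in = (weight_scale == 0.f) ? 0.f : 1.f / (bottom_scale * weight_scale);
    const float dequantized = sum_int32 * scale_in + bias;

    printf("dequantized output = %f\n", dequantized);
    return 0;
}
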
+ +#ifndef LAYER_CONVOLUTION_LOONGARCH_H +#define LAYER_CONVOLUTION_LOONGARCH_H + +#include "convolution.h" + +namespace ncnn { + +class Convolution_loongarch : virtual public Convolution +{ +public: + Convolution_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +protected: +#if NCNN_INT8 + int create_pipeline_int8_loongarch(const Option& opt); + int forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + +public: + Layer* activation; + + Mat weight_data_tm; + Mat weight_sgemm_data; + Mat weight_winograd23_data; + Mat weight_winograd43_data; + Mat weight_winograd63_data; + +#if NCNN_INT8 + Mat scale_in_data; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_CONVOLUTION_LOONGARCH_H diff --git a/src/layer/loongarch/convolution_pack1to4.h b/src/layer/loongarch/convolution_pack1to4.h new file mode 100644 index 000000000000..b7e0123d5edd --- /dev/null +++ b/src/layer/loongarch/convolution_pack1to4.h @@ -0,0 +1,90 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
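
convolution_pack1to4_lsx below handles the pack1 input / pack4 output case: each scalar input sample is broadcast across the four packed output channels and multiply-accumulated against a group of four weights with __lsx_vfmadd_s. A scalar equivalent of that inner loop, with illustrative sizes and names that are not part of the patch, looks like this.

// Illustrative scalar equivalent of the pack1to4 accumulation: lane j of the vector
// accumulator is packed output channel j.
#include <cstdio>

int main()
{
    const int maxk = 9;           // e.g. 3x3 kernel
    const int channels = 2;       // input channels, elempack = 1
    float in[2][9] = {};          // gathered input samples, in[q][k]
    float w[2][9][4] = {};        // packed weights: w[q][k][j] feeds output channel j
    float sum[4] = {0.f, 0.f, 0.f, 0.f};

    in[0][0] = 1.f;
    w[0][0][0] = 0.5f; w[0][0][1] = -0.5f; w[0][0][2] = 2.f; w[0][0][3] = 0.f;

    for (int q = 0; q < channels; q++)
        for (int k = 0; k < maxk; k++)
            for (int j = 0; j < 4; j++)
                sum[j] += in[q][k] * w[q][k][j];   // broadcast in[q][k], FMA with 4 weights

    printf("%f %f %f %f\n", sum[0], sum[1], sum[2], sum[3]);
    return 0;
}
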
+ +static void convolution_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_data_ptr) + { + _sum = (__m128)__lsx_vld(bias_data_ptr + p * 4, 0); + } + + const float* kptr = (const float*)weight_data_pack1ton + maxk * channels * p * 4; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const float* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) // 29.23 + { + __m128 _val = __lsx_vreplfr2vr_s(sptr[space_ofs[k]]); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + + kptr += 4; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_pack1to4_int8.h b/src/layer/loongarch/convolution_pack1to4_int8.h new file mode 100644 index 000000000000..b043503c2ac6 --- /dev/null +++ b/src/layer/loongarch/convolution_pack1to4_int8.h @@ -0,0 +1,87 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
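
The int8 variant below cannot use a float FMA, so it widens on the fly: __lsx_vslti_b produces a 0 / -1 mask for negative bytes, and interleaving that mask with the original bytes via __lsx_vilvl_b is a manual sign extension from int8 to int16; a second interleave widens the int16 products into the int32 accumulator. A scalar model of one such product, with made-up values, is below; it is a sketch of the idea, not the LSX code itself.

// Illustrative scalar view of the int8 widening trick: each int8*int8 product is
// formed exactly in int16, then widened to the 32-bit accumulator.
#include <cstdint>
#include <cstdio>

int main()
{
    int8_t val = -3;    // input sample
    int8_t w = 117;     // weight

    int16_t val16 = (int16_t)val;            // what vilvl_b(vslti_b(x, 0), x) achieves
    int16_t w16 = (int16_t)w;

    int16_t prod16 = (int16_t)(val16 * w16); // fits: |prod| <= 127 * 128
    int32_t sum = (int32_t)prod16;           // second widening step, 16 -> 32 bit

    printf("sum = %d\n", (int)sum);          // -351
    return 0;
}
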
+ +static void convolution_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128i _sum = __lsx_vreplgr2vr_w(0); + + const signed char* kptr = weight_data_int8.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vreplgr2vr_h((short)sptr[space_ofs[k]]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0); + + _sum = __lsx_vadd_w(_sum, _s032); + + kptr += 4; + } + } + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_pack4.h b/src/layer/loongarch/convolution_pack4.h new file mode 100644 index 000000000000..66a7863f015b --- /dev/null +++ b/src/layer/loongarch/convolution_pack4.h @@ -0,0 +1,102 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
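
convolution_pack4_lsx below is the fully packed case: every kernel tap reads four packed input channels and sixteen weights and updates four packed output channels, i.e. a small 4x4 matrix-vector product per tap. A scalar equivalent with illustrative numbers (not from the patch):

// Illustrative scalar equivalent of the pack4 inner loop: broadcast in[i] and
// accumulate against weight row i; sum[j] is packed output channel j.
#include <cstdio>

int main()
{
    float in[4] = {1.f, 2.f, 3.f, 4.f};  // one tap, 4 packed input channels
    float w[4][4] = {                    // w[i][j]: input channel i -> output channel j
        {0.1f, 0.2f, 0.3f, 0.4f},
        {0.5f, 0.6f, 0.7f, 0.8f},
        {0.9f, 1.0f, 1.1f, 1.2f},
        {1.3f, 1.4f, 1.5f, 1.6f},
    };
    float sum[4] = {0.f, 0.f, 0.f, 0.f};

    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            sum[j] += in[i] * w[i][j];

    printf("%f %f %f %f\n", sum[0], sum[1], sum[2], sum[3]);
    return 0;
}
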
+ +static void convolution_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + const float* bias_data_ptr = bias_data; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_data_ptr) + { + _sum = (__m128)__lsx_vld(bias_data_ptr + p * 4, 0); + } + + const float* kptr = (const float*)weight_data_pack4.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + for (int k = 0; k < maxk; k++) // 29.23 + { + const float* slptr = sptr + space_ofs[k] * 4; + + __m128 _val0 = __lsx_vreplfr2vr_s(slptr[0]); + __m128 _val1 = __lsx_vreplfr2vr_s(slptr[1]); + __m128 _val2 = __lsx_vreplfr2vr_s(slptr[2]); + __m128 _val3 = __lsx_vreplfr2vr_s(slptr[3]); + + __m128 _w0 = (__m128)__lsx_vld(kptr, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0); + + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + _sum = __lsx_vfmadd_s(_w1, _val1, _sum); + _sum = __lsx_vfmadd_s(_w2, _val2, _sum); + _sum = __lsx_vfmadd_s(_w3, _val3, _sum); + + kptr += 16; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_pack4to1.h b/src/layer/loongarch/convolution_pack4to1.h new file mode 100644 index 000000000000..872759fc7f12 --- /dev/null +++ b/src/layer/loongarch/convolution_pack4to1.h @@ -0,0 +1,94 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
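
convolution_pack4to1_lsx below keeps the accumulator vector-shaped for as long as possible: the four packed input channels are multiplied lane-wise into a 4-lane sum, and only after all taps and channels does __lsx_reduce_fadd_s collapse the lanes into a single output float. A scalar sketch with made-up values:

// Illustrative scalar sketch of the pack4to1 pattern: lane-wise accumulate, then
// one horizontal add at the end.
#include <cstdio>

int main()
{
    const int maxk = 2;                                       // pretend 2 kernel taps
    float in[2][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}};            // in[k][lane]
    float w[2][4] = {{0.1f, 0.2f, 0.3f, 0.4f}, {0.5f, 0.6f, 0.7f, 0.8f}};

    float acc[4] = {0.f, 0.f, 0.f, 0.f};                      // stays vector-shaped
    for (int k = 0; k < maxk; k++)
        for (int lane = 0; lane < 4; lane++)
            acc[lane] += in[k][lane] * w[k][lane];

    // __lsx_reduce_fadd_s equivalent: horizontal add of the 4 lanes
    float sum = acc[0] + acc[1] + acc[2] + acc[3];
    printf("sum = %f\n", sum);                                // prints about 20.4
    return 0;
}
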
+ +static void convolution_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4to1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_data_ptr) + { + sum = bias_data_ptr[p]; + } + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + const float* kptr = (const float*)weight_data_pack4to1.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + + kptr += 4; + } + } + + sum += __lsx_reduce_fadd_s(_sum); + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } +} diff --git a/src/layer/loongarch/convolution_pack8to1_int8.h b/src/layer/loongarch/convolution_pack8to1_int8.h new file mode 100644 index 000000000000..c7463a472b6f --- /dev/null +++ b/src/layer/loongarch/convolution_pack8to1_int8.h @@ -0,0 +1,87 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
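
convolution_pack8to1_int8_lsx below reduces the eight int16 products of each tap with __lsx_vhaddw_w_h(_s0, _s0), which sums adjacent int16 pairs into int32 lanes, so the pairwise sums are already 32-bit before they reach the accumulator. A scalar model of that step, with values chosen so the widening visibly matters; names and numbers are illustrative only.

// Illustrative scalar model of the pairwise widening add used in the pack8to1 path.
#include <cstdint>
#include <cstdio>

int main()
{
    int16_t s16[8] = {100, -200, 300, 400, -32000, -32000, 7, 8}; // eight int16 products

    int32_t widened[4];
    for (int i = 0; i < 4; i++)
        widened[i] = (int32_t)s16[2 * i] + (int32_t)s16[2 * i + 1]; // pairwise, no overflow

    int32_t acc[4] = {0, 0, 0, 0};
    for (int i = 0; i < 4; i++)
        acc[i] += widened[i];                 // running 32-bit accumulator across taps

    printf("%d %d %d %d\n", (int)acc[0], (int)acc[1], (int)acc[2], (int)acc[3]); // -100 700 -64000 15
    return 0;
}
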
+ +static void convolution_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128i _sum = __lsx_vreplgr2vr_w(0); + + const signed char* kptr = weight_data_int8.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vld(sptr + space_ofs[k] * 8, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + + _sum = __lsx_vadd_w(_sum, __lsx_vhaddw_w_h(_s0, _s0)); + + kptr += 8; + } + } + + outptr[j] = __lsx_reduce_add_w(_sum); + } + + outptr += outw; + } + } +} diff --git a/src/layer/loongarch/convolution_pack8to4_int8.h b/src/layer/loongarch/convolution_pack8to4_int8.h new file mode 100644 index 000000000000..00d90387bbed --- /dev/null +++ b/src/layer/loongarch/convolution_pack8to4_int8.h @@ -0,0 +1,120 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
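
In convolution_pack8to4_int8_lsx below, each of _sum0.._sum3 ends the loop holding four partial sums that all belong to one packed output channel. The 4x4 transpose regroups the lanes so that one vector holds the i-th partial of every channel, and the three vector adds that follow collapse the partials into one int32 per output channel. A scalar model of that epilogue with made-up partial sums, offered as an illustration rather than a restatement of the LSX code:

// Illustrative scalar model of the pack8to4 epilogue: transpose the partials, then
// add the rows to get one total per packed output channel.
#include <cstdint>
#include <cstdio>

int main()
{
    // partial[n][i]: i-th partial sum of output channel n (4 partials per channel)
    int32_t partial[4][4] = {
        {1, 2, 3, 4},
        {10, 20, 30, 40},
        {100, 200, 300, 400},
        {1000, 2000, 3000, 4000},
    };

    int32_t t[4][4];                          // transpose: t[i][n] = partial[n][i]
    for (int n = 0; n < 4; n++)
        for (int i = 0; i < 4; i++)
            t[i][n] = partial[n][i];

    int32_t out[4] = {0, 0, 0, 0};            // add the rows -> per-channel totals
    for (int i = 0; i < 4; i++)
        for (int n = 0; n < 4; n++)
            out[n] += t[i][n];

    printf("%d %d %d %d\n", (int)out[0], (int)out[1], (int)out[2], (int)out[3]); // 10 100 1000 10000
    return 0;
}
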
+ +static void convolution_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + const signed char* kptr = weight_data_int8.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vld(sptr + space_ofs[k] * 8, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + __m128i _s2 = __lsx_vmul_h(_val16, _w2); + __m128i _s3 = __lsx_vmul_h(_val16, _w3); + + _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0)); + _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1)); + _sum2 = __lsx_vadd_w(_sum2, __lsx_vhaddw_w_h(_s2, _s2)); + _sum3 = __lsx_vadd_w(_sum3, __lsx_vhaddw_w_h(_s3, _s3)); + + kptr += 32; + } + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + _sum0 = __lsx_vadd_w(_sum0, _sum2); + + __lsx_vst(_sum0, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_sgemm.h b/src/layer/loongarch/convolution_sgemm.h new file mode 100644 index 000000000000..7b74ceac14b2 --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm.h @@ -0,0 +1,650 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void im2col_sgemm_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + const float* bias = _bias; + + // permute + Mat tmp; + if (size >= 4) + tmp.create(4 * maxk, inch, size / 4 + size % 4, 4u, 1, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 4u, 1, opt.workspace_allocator); + { + int nn_size = size / 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = ii * 4; + + float* tmpptr = tmp.channel(i / 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { +#if __loongarch_sx + __lsx_vst(__lsx_vld(img0, 0), tmpptr, 0); +#else + tmpptr[0] = img0[0]; + tmpptr[1] = img0[1]; + tmpptr[2] = img0[2]; + tmpptr[3] = img0[3]; +#endif + img0 += size; + tmpptr += 4; + } + } + } + + int remain_size_start = nn_size * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + float* tmpptr = tmp.channel(i / 4 + i % 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + img0 += size; + tmpptr += 1; + } + } + } + } + +#if __loongarch_sx + int nn_outch = outch >> 3; + int remain_outch_start = nn_outch << 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 8; + + float* outptr0 = top_blob.channel(p); + float* outptr1 = top_blob.channel(p + 1); + float* outptr2 = top_blob.channel(p + 2); + float* outptr3 = top_blob.channel(p + 3); + float* outptr4 = top_blob.channel(p + 4); + float* outptr5 = top_blob.channel(p + 5); + float* outptr6 = top_blob.channel(p + 6); + float* outptr7 = top_blob.channel(p + 7); + + const float zeros[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* biasptr = bias ? 
bias + p : zeros; + + int i = 0; + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 4); + const float* kptr = kernel.channel(p / 8); + + int nn = inch * maxk; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(biasptr[0]); + __m128 _sum1 = __lsx_vreplfr2vr_s(biasptr[1]); + __m128 _sum2 = __lsx_vreplfr2vr_s(biasptr[2]); + __m128 _sum3 = __lsx_vreplfr2vr_s(biasptr[3]); + __m128 _sum4 = __lsx_vreplfr2vr_s(biasptr[4]); + __m128 _sum5 = __lsx_vreplfr2vr_s(biasptr[5]); + __m128 _sum6 = __lsx_vreplfr2vr_s(biasptr[6]); + __m128 _sum7 = __lsx_vreplfr2vr_s(biasptr[7]); + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 32); + __m128 _val = (__m128)__lsx_vld(tmpptr, 0); + __m128i _w0123 = __lsx_vld(kptr, 0); + __m128i _w4567 = __lsx_vld(kptr + 4, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3); + _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 0), _val, _sum4); + _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 1), _val, _sum5); + _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 2), _val, _sum6); + _sum7 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 3), _val, _sum7); + + tmpptr += 4; + kptr += 8; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + __lsx_vst(_sum2, outptr2, 0); + __lsx_vst(_sum3, outptr3, 0); + __lsx_vst(_sum4, outptr4, 0); + __lsx_vst(_sum5, outptr5, 0); + __lsx_vst(_sum6, outptr6, 0); + __lsx_vst(_sum7, outptr7, 0); + + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + outptr4 += 4; + outptr5 += 4; + outptr6 += 4; + outptr7 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 4 + i % 4); + const float* kptr = kernel.channel(p / 8); + + int nn = inch * maxk; // inch always > 0 + + float sum0 = biasptr[0]; + float sum1 = biasptr[1]; + float sum2 = biasptr[2]; + float sum3 = biasptr[3]; + float sum4 = biasptr[4]; + float sum5 = biasptr[5]; + float sum6 = biasptr[6]; + float sum7 = biasptr[7]; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + sum2 += tmpptr[0] * kptr[2]; + sum3 += tmpptr[0] * kptr[3]; + sum4 += tmpptr[0] * kptr[4]; + sum5 += tmpptr[0] * kptr[5]; + sum6 += tmpptr[0] * kptr[6]; + sum7 += tmpptr[0] * kptr[7]; + tmpptr++; + kptr += 8; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr2[0] = sum2; + outptr3[0] = sum3; + outptr4[0] = sum4; + outptr5[0] = sum5; + outptr6[0] = sum6; + outptr7[0] = sum7; + + outptr0++; + outptr1++; + outptr2++; + outptr3++; + outptr4++; + outptr5++; + outptr6++; + outptr7++; + } + } + + nn_outch = (outch - remain_outch_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = remain_outch_start + pp * 4; + + float* outptr0 = top_blob.channel(p); + float* outptr1 = top_blob.channel(p + 1); + float* outptr2 = top_blob.channel(p + 2); + float* outptr3 = top_blob.channel(p + 3); + + const float zeros[4] = {0.f, 0.f, 0.f, 0.f}; + const float* biasptr = bias ? 
bias + p : zeros; + + int i = 0; + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 4); + const float* kptr = kernel.channel(p / 8 + (p % 8) / 4); + + int nn = inch * maxk; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(biasptr[0]); + __m128 _sum1 = __lsx_vreplfr2vr_s(biasptr[1]); + __m128 _sum2 = __lsx_vreplfr2vr_s(biasptr[2]); + __m128 _sum3 = __lsx_vreplfr2vr_s(biasptr[3]); + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 16); + __m128 _val = (__m128)__lsx_vld(tmpptr, 0); + __m128i _w0123 = __lsx_vld(kptr, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3); + + tmpptr += 4; + kptr += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + __lsx_vst(_sum2, outptr2, 0); + __lsx_vst(_sum3, outptr3, 0); + + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 4 + i % 4); + const float* kptr = kernel.channel(p / 8 + (p % 8) / 4); + + int nn = inch * maxk; // inch always > 0 + + float sum0 = biasptr[0]; + float sum1 = biasptr[1]; + float sum2 = biasptr[2]; + float sum3 = biasptr[3]; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + sum2 += tmpptr[0] * kptr[2]; + sum3 += tmpptr[0] * kptr[3]; + tmpptr++; + kptr += 4; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr2[0] = sum2; + outptr3[0] = sum3; + + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + } + + remain_outch_start += nn_outch << 2; +#else // __loongarch_sx + int nn_outch = outch >> 1; + int remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + float* outptr0 = top_blob.channel(p); + float* outptr1 = top_blob.channel(p + 1); + + const float zeros[2] = {0.f, 0.f}; + const float* biasptr = bias ? 
bias + p : zeros; + + int i = 0; + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 4); + const float* kptr = kernel.channel(p / 2); + + int nn = inch * maxk; // inch always > 0 + + float sum00 = biasptr[0]; + float sum01 = biasptr[0]; + float sum02 = biasptr[0]; + float sum03 = biasptr[0]; + float sum10 = biasptr[1]; + float sum11 = biasptr[1]; + float sum12 = biasptr[1]; + float sum13 = biasptr[1]; + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 8); + float k0 = kptr[0]; + float k1 = kptr[1]; + sum00 += tmpptr[0] * k0; + sum01 += tmpptr[1] * k0; + sum02 += tmpptr[2] * k0; + sum03 += tmpptr[3] * k0; + sum10 += tmpptr[0] * k1; + sum11 += tmpptr[1] * k1; + sum12 += tmpptr[2] * k1; + sum13 += tmpptr[3] * k1; + tmpptr += 4; + kptr += 2; + } + + outptr0[0] = sum00; + outptr0[1] = sum01; + outptr0[2] = sum02; + outptr0[3] = sum03; + outptr1[0] = sum10; + outptr1[1] = sum11; + outptr1[2] = sum12; + outptr1[3] = sum13; + + outptr0 += 4; + outptr1 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 4 + i % 4); + const float* kptr = kernel.channel(p / 2); + + int nn = inch * maxk; // inch always > 0 + + float sum0 = biasptr[0]; + float sum1 = biasptr[1]; + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 4); + __builtin_prefetch(kptr + 8); + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + tmpptr++; + kptr += 2; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + + outptr0++; + outptr1++; + } + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + float* outptr0 = top_blob.channel(p); + + const float bias0 = bias ? bias[p] : 0.f; + + int i = 0; + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 4); +#if __loongarch_sx + const float* kptr = kernel.channel(p / 8 + (p % 8) / 4 + p % 4); +#else + const float* kptr = kernel.channel(p / 2 + p % 2); +#endif + + int nn = inch * maxk; // inch always > 0 + +#if __loongarch_sx + __m128 _sum0 = __lsx_vreplfr2vr_s(bias0); + + for (int q = 0; q < nn; q++) + { + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vld(tmpptr, 0), __lsx_vreplfr2vr_s(kptr[0]), _sum0); + tmpptr += 4; + kptr++; + } + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; +#else + float sum0 = bias0; + float sum1 = bias0; + float sum2 = bias0; + float sum3 = bias0; + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 4); + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[1] * kptr[0]; + sum2 += tmpptr[2] * kptr[0]; + sum3 += tmpptr[3] * kptr[0]; + tmpptr += 4; + kptr++; + } + + outptr0[0] = sum0; + outptr0[1] = sum1; + outptr0[2] = sum2; + outptr0[3] = sum3; + + outptr0 += 4; +#endif // __loongarch_sx + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 4 + i % 4); +#if __loongarch_sx + const float* kptr = kernel.channel(p / 8 + (p % 8) / 4 + p % 4); +#else + const float* kptr = kernel.channel(p / 2 + p % 2); +#endif + + int nn = inch * maxk; // inch always > 0 + + float sum0 = bias0; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + tmpptr++; + kptr++; + } + + outptr0[0] = sum0; + + outptr0++; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 8b-maxk-inch-outch/8b + Mat kernel = 
_kernel.reshape(maxk, inch, outch); +#if __loongarch_sx + kernel_tm.create(8 * maxk, inch, outch / 8 + (outch % 8) / 4 + outch % 4); +#else + kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2); +#endif + + int q = 0; +#if __loongarch_sx + for (; q + 7 < outch; q += 8) + { + const Mat k0 = kernel.channel(q); + const Mat k1 = kernel.channel(q + 1); + const Mat k2 = kernel.channel(q + 2); + const Mat k3 = kernel.channel(q + 3); + const Mat k4 = kernel.channel(q + 4); + const Mat k5 = kernel.channel(q + 5); + const Mat k6 = kernel.channel(q + 6); + const Mat k7 = kernel.channel(q + 7); + + float* g00 = kernel_tm.channel(q / 8); + + for (int p = 0; p < inch; p++) + { + const float* k00 = k0.row(p); + const float* k10 = k1.row(p); + const float* k20 = k2.row(p); + const float* k30 = k3.row(p); + const float* k40 = k4.row(p); + const float* k50 = k5.row(p); + const float* k60 = k6.row(p); + const float* k70 = k7.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + g00[1] = k10[k]; + g00[2] = k20[k]; + g00[3] = k30[k]; + g00[4] = k40[k]; + g00[5] = k50[k]; + g00[6] = k60[k]; + g00[7] = k70[k]; + + g00 += 8; + } + } + } + for (; q + 3 < outch; q += 4) + { + const Mat k0 = kernel.channel(q); + const Mat k1 = kernel.channel(q + 1); + const Mat k2 = kernel.channel(q + 2); + const Mat k3 = kernel.channel(q + 3); + + float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4); + + for (int p = 0; p < inch; p++) + { + const float* k00 = k0.row(p); + const float* k10 = k1.row(p); + const float* k20 = k2.row(p); + const float* k30 = k3.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + g00[1] = k10[k]; + g00[2] = k20[k]; + g00[3] = k30[k]; + + g00 += 4; + } + } + } +#else + for (; q + 1 < outch; q += 2) + { + const Mat k0 = kernel.channel(q); + const Mat k1 = kernel.channel(q + 1); + + float* g00 = kernel_tm.channel(q / 2); + + for (int p = 0; p < inch; p++) + { + const float* k00 = k0.row(p); + const float* k10 = k1.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + g00[1] = k10[k]; + + g00 += 2; + } + } + } +#endif // __loongarch_sx + for (; q < outch; q++) + { + const Mat k0 = kernel.channel(q); + +#if __loongarch_sx + float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + q % 4); +#else + float* g00 = kernel_tm.channel(q / 2 + q % 2); +#endif + + for (int p = 0; p < inch; p++) + { + const float* k00 = k0.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + + g00 += 1; + } + } + } +} + +static void convolution_im2col_sgemm_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + float* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const float* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + 
im2col_sgemm_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_int8.h b/src/layer/loongarch/convolution_sgemm_int8.h new file mode 100644 index 000000000000..98f47760901f --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_int8.h @@ -0,0 +1,800 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void im2col_sgemm_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; +#if __loongarch_sx + if (inch >= 4) + { + if (size >= 2) + tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator); + } + else +#endif // __loongarch_sx + { + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator); + } + { + int remain_size_start = 0; + int nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + signed char* tmpptr = tmp.channel(i / 2); + + int q = 0; +#if __loongarch_sx + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr[4] = img0[1]; + tmpptr[5] = img1[1]; + tmpptr[6] = img2[1]; + tmpptr[7] = img3[1]; + tmpptr += 8; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img0[1]; + + tmpptr += 2; + + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + signed char* tmpptr = tmp.channel(i / 2 + i % 2); + + int q = 0; +#if __loongarch_sx + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed 
char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr += 4; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + + tmpptr += 1; + + img0 += size; + } + } + } + } + +#if __loongarch_sx + int nn_outch = outch >> 2; + int remain_outch_start = nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + int* outptr2 = top_blob.channel(p + 2); + int* outptr3 = top_blob.channel(p + 3); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val01 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _val0 = __lsx_vilvl_d(_val01, _val01); + __m128i _val1 = __lsx_vilvh_d(_val01, _val01); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + + __m128i _s00 = __lsx_vmul_h(_val0, _w0); + __m128i _s01 = __lsx_vmul_h(_val0, _w1); + __m128i _s10 = __lsx_vmul_h(_val1, _w0); + __m128i _s11 = __lsx_vmul_h(_val1, _w1); + + __m128i _exts00 = __lsx_vslti_h(_s00, 0); + __m128i _exts01 = __lsx_vslti_h(_s01, 0); + __m128i _exts10 = __lsx_vslti_h(_s10, 0); + __m128i _exts11 = __lsx_vslti_h(_s11, 0); + __m128i _s00l = __lsx_vilvl_h(_exts00, _s00); + __m128i _s00h = __lsx_vilvh_h(_exts00, _s00); + __m128i _s01l = __lsx_vilvl_h(_exts01, _s01); + __m128i _s01h = __lsx_vilvh_h(_exts01, _s01); + __m128i _s10l = __lsx_vilvl_h(_exts10, _s10); + __m128i _s10h = __lsx_vilvh_h(_exts10, _s10); + __m128i _s11l = __lsx_vilvl_h(_exts11, _s11); + __m128i _s11h = __lsx_vilvh_h(_exts11, _s11); + + _sum00 = __lsx_vadd_w(_sum00, _s00l); + _sum01 = __lsx_vadd_w(_sum01, _s00h); + _sum02 = __lsx_vadd_w(_sum02, _s01l); + _sum03 = __lsx_vadd_w(_sum03, _s01h); + _sum10 = __lsx_vadd_w(_sum10, _s10l); + _sum11 = __lsx_vadd_w(_sum11, _s10h); + _sum12 = __lsx_vadd_w(_sum12, _s11l); + _sum13 = __lsx_vadd_w(_sum13, _s11h); + + tmpptr += 8; + kptr += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = 
__lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + } + + int j = 0; + for (; j < nn1; j++) + { + __m128i _val0 = __lsx_vreplgr2vr_h(tmpptr[0]); + __m128i _val1 = __lsx_vreplgr2vr_h(tmpptr[1]); + __m128i _val = __lsx_vilvl_d(_val1, _val0); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum00 = __lsx_vadd_w(_sum00, _s0l); + _sum10 = __lsx_vadd_w(_sum10, _s0h); + + tmpptr += 2; + kptr += 4; + } + + int sum[8]; + __lsx_vst(_sum00, sum, 0); + __lsx_vst(_sum10, sum + 4, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0[1] = sum[4]; + outptr1[1] = sum[5]; + outptr2[1] = sum[6]; + outptr3[1] = sum[7]; + outptr0 += 2; + outptr1 += 2; + outptr2 += 2; + outptr3 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + _val16 = __lsx_vilvl_d(_val16, _val16); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _exts1 = __lsx_vslti_h(_s1, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + __m128i _s1l = __lsx_vilvl_h(_exts1, _s1); + __m128i _s1h = __lsx_vilvh_h(_exts1, _s1); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + _sum2 = __lsx_vadd_w(_sum2, _s1l); + _sum3 = __lsx_vadd_w(_sum3, _s1h); + + tmpptr += 4; + kptr += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + _sum0 = __lsx_vadd_w(_sum0, _sum2); + } + int j = 0; + for (; j < nn1; j++) + { + __m128i _val = __lsx_vreplgr2vr_h(tmpptr[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0); + + 
_sum0 = __lsx_vadd_w(_sum0, _s032); + + tmpptr += 1; + kptr += 4; + } + + int sum[4]; + __lsx_vst(_sum0, sum, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } +#else // __loongarch_sx + int nn_outch = outch >> 1; + int remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 2); + + int sum00 = 0; + int sum01 = 0; + int sum10 = 0; + int sum11 = 0; + + int nn1 = inch * maxk; + + int j = 0; + for (; j < nn1; j++) + { + signed char val0 = tmpptr[0]; + signed char val1 = tmpptr[1]; + signed char w0 = kptr[0]; + signed char w1 = kptr[1]; + + sum00 += val0 * w0; + sum01 += val1 * w0; + sum10 += val0 * w1; + sum11 += val1 * w1; + + tmpptr += 2; + kptr += 2; + } + + outptr0[0] = sum00; + outptr0[1] = sum01; + outptr1[0] = sum10; + outptr1[1] = sum11; + outptr0 += 2; + outptr1 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 2); + + int sum00 = 0; + int sum10 = 0; + + int nn1 = inch * maxk; + + int j = 0; + for (; j < nn1; j++) + { + signed char val0 = tmpptr[0]; + signed char w0 = kptr[0]; + signed char w1 = kptr[1]; + + sum00 += val0 * w0; + sum10 += val0 * w1; + + tmpptr += 1; + kptr += 2; + } + + outptr0[0] = sum00; + outptr1[0] = sum10; + outptr0 += 1; + outptr1 += 1; + } + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); +#if __loongarch_sx + const signed char* kptr = kernel.channel(p / 4 + p % 4); +#else + const signed char* kptr = kernel.channel(p / 2 + p % 2); +#endif + + int sum0 = 0; + int sum1 = 0; + +#if __loongarch_sx + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + if (nn4 > 0) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + + tmpptr += 8; + kptr += 4; + } + + sum0 = __lsx_reduce_add_w(_sum0); + sum1 = __lsx_reduce_add_w(_sum1); + } +#else + int nn1 = inch * maxk; +#endif // __loongarch_sx + + int j = 0; + for (; j < nn1; j++) + { + signed char val0 = tmpptr[0]; + signed char val1 = tmpptr[1]; + signed char w = kptr[0]; + + sum0 += val0 * w; + sum1 += val1 * w; + + tmpptr += 2; + kptr += 1; + } + + outptr0[0] = sum0; + outptr0[1] = sum1; + outptr0 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); +#if __loongarch_sx + const signed char* kptr = kernel.channel(p / 4 + p % 4); +#else + const signed char* 
kptr = kernel.channel(p / 2 + p % 2); +#endif + + int sum = 0; + +#if __loongarch_sx + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + if (nn4 > 0) + { + __m128i _sum = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0); + + _sum = __lsx_vadd_w(_sum, _s032); + + tmpptr += 4; + kptr += 4; + } + + sum = __lsx_reduce_add_w(_sum); + } +#else + int nn1 = inch * maxk; +#endif // __loongarch_sx + + int j = 0; + for (; j < nn1; j++) + { + signed char val = tmpptr[0]; + signed char w = kptr[0]; + + sum += val * w; + + tmpptr += 1; + kptr += 1; + } + + outptr0[0] = sum; + outptr0 += 1; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 4a-4b-maxk-inch/4a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); +#if __loongarch_sx + if (outch >= 4) + { + if (inch >= 4) + kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4 + outch % 4, (size_t)1u); + else + kernel_tm.create(4 * maxk, inch, outch / 4 + outch % 4, (size_t)1u); + } +#else + if (outch >= 2) + { + kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2, (size_t)1u); + } +#endif // __loongarch_sx + else + { +#if __loongarch_sx + if (inch >= 4) + kernel_tm.create(4 * maxk, inch / 4 + inch % 4, outch, (size_t)1u); + else +#endif // __loongarch_sx + { + kernel_tm.create(1 * maxk, inch, outch, (size_t)1u); + } + } + + int q = 0; +#if __loongarch_sx + for (; q + 3 < outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + int p = 0; + for (; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const signed char* k00 = kernel.channel(q + i).row(p + j); + g00[0] = k00[k]; + g00++; + } + } + } + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + const signed char* k00 = kernel.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#else // __loongarch_sx + for (; q + 1 < outch; q += 2) + { + signed char* g00 = kernel_tm.channel(q / 2); + + int p = 0; + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 2; i++) + { + const signed char* k00 = kernel.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#endif // __loongarch_sx + for (; q < outch; q++) + { +#if __loongarch_sx + signed char* g00 = kernel_tm.channel(q / 4 + q % 4); +#else + signed char* g00 = kernel_tm.channel(q / 2 + q % 2); +#endif + + int p = 0; +#if __loongarch_sx + for (; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int j = 0; j < 4; j++) + { + const signed char* k00 = kernel.channel(q).row(p + j); + g00[0] = k00[k]; + g00++; + } + } + } +#endif // __loongarch_sx + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k00 = kernel.channel(q).row(p); + g00[0] = k00[k]; + g00++; + } + } + } +} + +static void convolution_im2col_sgemm_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, 
const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + signed char* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const signed char* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + ptr[2] = sptr[stride_w * 2]; + ptr[3] = sptr[stride_w * 3]; + + sptr += stride_w * 4; + ptr += 4; + } + for (; j + 1 < outw; j += 2) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + + sptr += stride_w * 2; + ptr += 2; + } + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack1to4_int8.h b/src/layer/loongarch/convolution_sgemm_pack1to4_int8.h new file mode 100644 index 000000000000..3429bfae5fa6 --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack1to4_int8.h @@ -0,0 +1,481 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
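Every sgemm variant in this patch, including the pack1to4 int8 kernel that follows, first lowers the convolution to a matrix product: im2col copies, for each output pixel, the kernel_w x kernel_h window of every input channel into one column, and the transform_kernel helpers interleave the weights so the inner loops can stream both operands contiguously. A plain scalar equivalent of the im2col loop is sketched below; ncnn's Mat row/channel accessors are flattened to raw pointers and the function name is made up for the sketch.

// Scalar im2col for one input channel (sketch only, mirrors the loop in
// convolution_im2col_sgemm_int8_lsx): output row (u * kernel_w + v) holds the
// input value each output pixel reads through kernel tap (u, v).
static void im2col_channel_ref(const signed char* img, int w, // input plane of width w
                               signed char* col,              // maxk rows of outw*outh values
                               int outw, int outh,
                               int kernel_w, int kernel_h,
                               int dilation_w, int dilation_h,
                               int stride_w, int stride_h)
{
    for (int u = 0; u < kernel_h; u++)
    {
        for (int v = 0; v < kernel_w; v++)
        {
            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    *col++ = img[(i * stride_h + u * dilation_h) * w + j * stride_w + v * dilation_w];
                }
            }
        }
    }
}

After this lowering, the permute step regroups output columns in pairs (and input channels in groups of four under __loongarch_sx), which is why the 4-output-channel inner loop above reads 16 interleaved weight bytes per step.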
+ +static void im2col_sgemm_pack1to4_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (inch >= 4) + { + if (size >= 2) + tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator); + } + else + { + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator); + } + { + int remain_size_start = 0; + int nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + signed char* tmpptr = tmp.channel(i / 2); + + int q = 0; + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr[4] = img0[1]; + tmpptr[5] = img1[1]; + tmpptr[6] = img2[1]; + tmpptr[7] = img3[1]; + tmpptr += 8; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img0[1]; + + tmpptr += 2; + + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + signed char* tmpptr = tmp.channel(i / 2 + i % 2); + + int q = 0; + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr += 4; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + + tmpptr += 1; + + img0 += size; + } + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = 
__lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val01 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _val0 = __lsx_vilvl_d(_val01, _val01); + __m128i _val1 = __lsx_vilvh_d(_val01, _val01); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + + __m128i _s00 = __lsx_vmul_h(_val0, _w0); + __m128i _s01 = __lsx_vmul_h(_val0, _w1); + __m128i _s10 = __lsx_vmul_h(_val1, _w0); + __m128i _s11 = __lsx_vmul_h(_val1, _w1); + + __m128i _exts00 = __lsx_vslti_h(_s00, 0); + __m128i _exts01 = __lsx_vslti_h(_s01, 0); + __m128i _exts10 = __lsx_vslti_h(_s10, 0); + __m128i _exts11 = __lsx_vslti_h(_s11, 0); + __m128i _s00l = __lsx_vilvl_h(_exts00, _s00); + __m128i _s00h = __lsx_vilvh_h(_exts00, _s00); + __m128i _s01l = __lsx_vilvl_h(_exts01, _s01); + __m128i _s01h = __lsx_vilvh_h(_exts01, _s01); + __m128i _s10l = __lsx_vilvl_h(_exts10, _s10); + __m128i _s10h = __lsx_vilvh_h(_exts10, _s10); + __m128i _s11l = __lsx_vilvl_h(_exts11, _s11); + __m128i _s11h = __lsx_vilvh_h(_exts11, _s11); + + _sum00 = __lsx_vadd_w(_sum00, _s00l); + _sum01 = __lsx_vadd_w(_sum01, _s00h); + _sum02 = __lsx_vadd_w(_sum02, _s01l); + _sum03 = __lsx_vadd_w(_sum03, _s01h); + _sum10 = __lsx_vadd_w(_sum10, _s10l); + _sum11 = __lsx_vadd_w(_sum11, _s10h); + _sum12 = __lsx_vadd_w(_sum12, _s11l); + _sum13 = __lsx_vadd_w(_sum13, _s11h); + + tmpptr += 8; + kptr += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = __lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + } + + int j = 0; + for (; j < nn1; j++) + { + __m128i _val0 = __lsx_vreplgr2vr_h(tmpptr[0]); + __m128i _val1 = __lsx_vreplgr2vr_h(tmpptr[1]); + __m128i _val = __lsx_vilvl_d(_val1, _val0); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum00 = __lsx_vadd_w(_sum00, _s0l); + _sum10 = __lsx_vadd_w(_sum10, _s0h); + + tmpptr += 2; + kptr += 4; + } + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum10, outptr0 + 4, 0); + outptr0 += 8; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* 
kptr = kernel.channel(p); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + _val16 = __lsx_vilvl_d(_val16, _val16); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _exts1 = __lsx_vslti_h(_s1, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + __m128i _s1l = __lsx_vilvl_h(_exts1, _s1); + __m128i _s1h = __lsx_vilvh_h(_exts1, _s1); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + _sum2 = __lsx_vadd_w(_sum2, _s1l); + _sum3 = __lsx_vadd_w(_sum3, _s1h); + + tmpptr += 4; + kptr += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + _sum0 = __lsx_vadd_w(_sum0, _sum2); + } + + int j = 0; + for (; j < nn1; j++) + { + __m128i _val = __lsx_vreplgr2vr_h(tmpptr[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s032); + + tmpptr += 1; + kptr += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + outptr0 += 4; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 4a-4b-maxk-inch/4a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); + if (inch >= 4) + kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4, (size_t)1u); + else + kernel_tm.create(4 * maxk, inch, outch / 4, (size_t)1u); + + for (int q = 0; q + 3 < outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + int p = 0; + for (; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const signed char* k00 = kernel.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + const signed char* k00 = kernel.channel(q + i).row(p); + + g00[0] = k00[k]; + + g00++; + } + } + } + } +} + +static void convolution_im2col_sgemm_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = 
top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + signed char* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const signed char* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + ptr[2] = sptr[stride_w * 2]; + ptr[3] = sptr[stride_w * 3]; + + sptr += stride_w * 4; + ptr += 4; + } + for (; j + 1 < outw; j += 2) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + + sptr += stride_w * 2; + ptr += 2; + } + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack1to4_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack4.h b/src/layer/loongarch/convolution_sgemm_pack4.h new file mode 100644 index 000000000000..e3e7279a5d2c --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack4.h @@ -0,0 +1,519 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
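The float pack4 kernel that follows keeps the same im2col + GEMM structure but tiles the output columns in groups of 12, 8, 4, 2 and 1, holding one __m128 accumulator per column in the tile; each inner-loop step broadcasts a single input value (__lsx_vreplvei_w, or __lsx_vreplfr2vr_s in the tail) into a fused multiply-add against one 4-wide weight vector, so every accumulator collects four output channels for its column. The scalar view of a single column is sketched below; the function name and array arguments are illustrative only, not part of the patch.

// What one column of the pack4 sgemm inner loop computes (sketch only).
// The single-column tail does exactly this with
//     _sum = __lsx_vfmadd_s(_w0, __lsx_vreplfr2vr_s(*tmpptr++), _sum);
// the wider tiles just run 2/4/8/12 such accumulators in parallel.
static void sgemm_pack4_column_ref(const float* tmpptr, // permuted inputs for this column
                                   const float* kptr0,  // weights, 4 output channels per step
                                   const float* bias4,  // 4 bias values, may be NULL
                                   float out4[4],       // 4 output channels at this pixel
                                   int nn)              // inch * maxk * 4 steps
{
    for (int oc = 0; oc < 4; oc++)
        out4[oc] = bias4 ? bias4[oc] : 0.f;

    for (int j = 0; j < nn; j++)
    {
        for (int oc = 0; oc < 4; oc++)
            out4[oc] += tmpptr[j] * kptr0[j * 4 + oc];
    }
}

The widest tile stops at 12 because 12 accumulators plus the broadcast values and one weight vector still fit comfortably in LSX's 32 vector registers.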
+ +static void im2col_sgemm_pack4_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + const float* bias = _bias; + + // permute + Mat tmp; + if (size >= 12) + tmp.create(12 * maxk, inch, size / 12 + (size % 12) / 8 + (size % 12 % 8) / 4 + (size % 12 % 4) / 2 + size % 12 % 2, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 8) + tmp.create(8 * maxk, inch, size / 8 + (size % 8) / 4 + (size % 4) / 2 + size % 2, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 4) + tmp.create(4 * maxk, inch, size / 4 + (size % 4) / 2 + size % 2, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 4u * 4, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 4u * 4, 4, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = size / 12; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 12; + + float* tmpptr = tmp.channel(i / 12); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x12 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0); + __m128i _r8 = __lsx_vld(img0 + 4 * 8, 0); + __m128i _r9 = __lsx_vld(img0 + 4 * 9, 0); + __m128i _ra = __lsx_vld(img0 + 4 * 10, 0); + __m128i _rb = __lsx_vld(img0 + 4 * 11, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r89r = __lsx_vilvl_w(_r9, _r8); + __m128i _r89l = __lsx_vilvh_w(_r9, _r8); + __m128i _rabr = __lsx_vilvl_w(_rb, _ra); + __m128i _rabl = __lsx_vilvh_w(_rb, _ra); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + __m128i _r89ab_0 = __lsx_vilvl_d(_rabr, _r89r); + __m128i _r89ab_1 = __lsx_vilvh_d(_rabr, _r89r); + __m128i _r89ab_2 = __lsx_vilvl_d(_rabl, _r89l); + __m128i _r89ab_3 = __lsx_vilvh_d(_rabl, _r89l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r89ab_0, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 4, 0); + __lsx_vst(_r89ab_1, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 7, 0); + __lsx_vst(_r89ab_2, tmpptr + 4 * 8, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 9, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 10, 0); + 
__lsx_vst(_r89ab_3, tmpptr + 4 * 11, 0); + + img0 += size * 4; + tmpptr += 48; + } + } + } + + remain_size_start += nn_size * 12; + nn_size = (size - remain_size_start) >> 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 8; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x8 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 2, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 4, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 7, 0); + + img0 += size * 4; + tmpptr += 32; + } + } + } + + remain_size_start += nn_size << 3; + nn_size = (size - remain_size_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 4; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r0123_1, tmpptr + 4, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 3, 0); + + img0 += size * 4; + tmpptr += 16; + } + } + } + + remain_size_start += nn_size << 2; + nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2); + + for (int q = 0; q < 
inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x2 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + + __m128i _r01_0 = __lsx_vilvl_w(_r1, _r0); + __m128i _r01_1 = __lsx_vilvh_w(_r1, _r0); + + __lsx_vst(_r01_0, tmpptr, 0); + __lsx_vst(_r01_1, tmpptr + 4, 0); + + img0 += size * 4; + tmpptr += 8; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vld(img0, 0); + __lsx_vst(_val, tmpptr, 0); + + img0 += size * 4; + tmpptr += 4; + } + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 11 < size; i += 12) + { + const float* tmpptr = tmp.channel(i / 12); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = _sum0; + __m128 _sum2 = _sum0; + __m128 _sum3 = _sum0; + __m128 _sum4 = _sum0; + __m128 _sum5 = _sum0; + __m128 _sum6 = _sum0; + __m128 _sum7 = _sum0; + __m128 _sum8 = _sum0; + __m128 _sum9 = _sum0; + __m128 _suma = _sum0; + __m128 _sumb = _sum0; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 48); + __builtin_prefetch(kptr0 + 16); + __m128i _val0123 = __lsx_vld(tmpptr, 0); + __m128i _val4567 = __lsx_vld(tmpptr + 4, 0); + __m128i _val89ab = __lsx_vld(tmpptr + 8, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4); + _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5); + _sum6 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6); + _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7); + _sum8 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 0), _sum8); + _sum9 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 1), _sum9); + _suma = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 2), _suma); + _sumb = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 3), _sumb); + + tmpptr += 12; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + __lsx_vst(_sum4, outptr0 + 4 * 4, 0); + __lsx_vst(_sum5, outptr0 + 4 * 5, 0); + __lsx_vst(_sum6, outptr0 + 4 * 6, 0); + __lsx_vst(_sum7, outptr0 + 4 * 7, 0); + __lsx_vst(_sum8, outptr0 + 4 * 8, 0); + __lsx_vst(_sum9, outptr0 + 4 * 9, 0); + __lsx_vst(_suma, outptr0 + 4 * 10, 0); + __lsx_vst(_sumb, outptr0 + 4 * 11, 0); + + outptr0 += 4 * 12; + } + for (; i + 7 < size; i += 8) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch 
always > 0 + + __m128 _sum0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = _sum0; + __m128 _sum2 = _sum0; + __m128 _sum3 = _sum0; + __m128 _sum4 = _sum0; + __m128 _sum5 = _sum0; + __m128 _sum6 = _sum0; + __m128 _sum7 = _sum0; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr0 + 16); + __m128i _val0123 = __lsx_vld(tmpptr, 0); + __m128i _val4567 = __lsx_vld(tmpptr + 4, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4); + _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5); + _sum6 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6); + _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7); + + tmpptr += 8; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + __lsx_vst(_sum4, outptr0 + 4 * 4, 0); + __lsx_vst(_sum5, outptr0 + 4 * 5, 0); + __lsx_vst(_sum6, outptr0 + 4 * 6, 0); + __lsx_vst(_sum7, outptr0 + 4 * 7, 0); + + outptr0 += 4 * 8; + } + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = _sum0; + __m128 _sum2 = _sum0; + __m128 _sum3 = _sum0; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr0 + 16); + __m128i _val0123 = __lsx_vld(tmpptr, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + + tmpptr += 4; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + + outptr0 += 4 * 4; + } + for (; i + 1 < size; i += 2) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = bias ? 
(__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = _sum0; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 8); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*tmpptr++); + __m128 _val1 = __lsx_vreplfr2vr_s(*tmpptr++); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, _val0, _sum0); + _sum1 = __lsx_vfmadd_s(_w0, _val1, _sum1); + + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + + outptr0 += 4 * 2; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 4); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*tmpptr++); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + + kptr0 += 4; + } + + __lsx_vst(_sum, outptr0, 0); + + outptr0 += 4; + } + } +} + +static void convolution_im2col_sgemm_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator); + { + const int gap = (w * stride_h - outw * stride_w) * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + float* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const float* sptr = img.row(dilation_h * u) + dilation_w * v * 4; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __lsx_vst(_val, ptr, 0); + + sptr += stride_w * 4; + ptr += 4; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack4_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack4to1.h b/src/layer/loongarch/convolution_sgemm_pack4to1.h new file mode 100644 index 000000000000..3748645b4d4c --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack4to1.h @@ -0,0 +1,667 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
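[editor's note] The convolution_im2col_sgemm_*_lsx wrappers in these files all follow the same im2col + sgemm split: every kernel tap of every input channel is copied out into a size x (inch * maxk) matrix, after which the convolution is an ordinary matrix product against the re-laid-out kernel. The scalar sketch below (hypothetical names, stride 1, no dilation, no packing, no LSX) only illustrates that decomposition and is not part of the patch:

#include <cstddef>
#include <vector>

// naive im2col + matmul reference: bottom is [inch][h][w], weight is
// [outch][inch][maxk] with taps ordered u * kernel_w + v, top is [outch][outh][outw]
static void conv_im2col_gemm_ref(const float* bottom, int w, int h, int inch,
                                 const float* weight, float* top,
                                 int outw, int outh, int outch,
                                 int kernel_w, int kernel_h)
{
    const int size = outw * outh;
    const int maxk = kernel_w * kernel_h;

    // im2col: row = (input channel, kernel tap), column = output position
    std::vector<float> cols((size_t)inch * maxk * size);
    for (int q = 0; q < inch; q++)
        for (int u = 0; u < kernel_h; u++)
            for (int v = 0; v < kernel_w; v++)
                for (int i = 0; i < outh; i++)
                    for (int j = 0; j < outw; j++)
                        cols[((size_t)q * maxk + u * kernel_w + v) * size + i * outw + j]
                            = bottom[(size_t)q * w * h + (i + u) * w + (j + v)];

    // sgemm: top[p][x] = sum over (q, k) of weight[p][q][k] * cols[q * maxk + k][x]
    for (int p = 0; p < outch; p++)
        for (int x = 0; x < size; x++)
        {
            float sum = 0.f;
            for (int qk = 0; qk < inch * maxk; qk++)
                sum += weight[(size_t)p * inch * maxk + qk] * cols[(size_t)qk * size + x];
            top[(size_t)p * size + x] = sum;
        }
}

The LSX kernels vectorize exactly this product; the extra packing passes only reorder the columns so that blocks of 12, 8, 4, 2 or 1 output positions (depending on the variant) can be streamed through the four-lane accumulators without strided loads.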
+ +static void im2col_sgemm_pack4to1_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + const float* bias = _bias; + + Mat tmp; + if (size >= 12) + tmp.create(12 * maxk, inch, size / 12 + (size % 12) / 8 + (size % 12 % 8) / 4 + size % 12 % 4, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 8) + tmp.create(8 * maxk, inch, size / 8 + (size % 8) / 4 + size % 4, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 4) + tmp.create(4 * maxk, inch, size / 4 + size % 4, 4u * 4, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 4u * 4, 4, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = size / 12; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 12; + + float* tmpptr = tmp.channel(i / 12); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x12 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0); + __m128i _r8 = __lsx_vld(img0 + 4 * 8, 0); + __m128i _r9 = __lsx_vld(img0 + 4 * 9, 0); + __m128i _ra = __lsx_vld(img0 + 4 * 10, 0); + __m128i _rb = __lsx_vld(img0 + 4 * 11, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r89r = __lsx_vilvl_w(_r9, _r8); + __m128i _r89l = __lsx_vilvh_w(_r9, _r8); + __m128i _rabr = __lsx_vilvl_w(_rb, _ra); + __m128i _rabl = __lsx_vilvh_w(_rb, _ra); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + __m128i _r89ab_0 = __lsx_vilvl_d(_rabr, _r89r); + __m128i _r89ab_1 = __lsx_vilvh_d(_rabr, _r89r); + __m128i _r89ab_2 = __lsx_vilvl_d(_rabl, _r89l); + __m128i _r89ab_3 = __lsx_vilvh_d(_rabl, _r89l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r89ab_0, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 4, 0); + __lsx_vst(_r89ab_1, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 7, 0); + __lsx_vst(_r89ab_2, tmpptr + 4 * 8, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 9, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 10, 0); + __lsx_vst(_r89ab_3, tmpptr + 4 * 11, 0); + + img0 += size * 4; + tmpptr += 48; + } + } + } + + remain_size_start += nn_size * 12; + nn_size = (size - remain_size_start) >> 3; + + 
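+        // [editor's note] leftover output columns are repacked in progressively
+        // narrower blocks (8, then 4, then 1 column); each block width matches one
+        // of the accumulator tilings used by the GEMM loops later in this file.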
#pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 8; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x8 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 2, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 4, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 7, 0); + + img0 += size * 4; + tmpptr += 32; + } + } + } + + remain_size_start += nn_size << 3; + nn_size = (size - remain_size_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 4; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r0123_1, tmpptr + 4, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 3, 0); + + img0 += size * 4; + tmpptr += 16; + } + } + } + + remain_size_start += nn_size << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(img0, 0); + __lsx_vst(_val, tmpptr, 0); + + img0 += size * 4; + tmpptr += 4; + } + } + } + } + + int 
nn_outch = outch / 4; + int remain_outch_start = nn_outch * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + float* outptr0 = top_blob.channel(p); + float* outptr1 = top_blob.channel(p + 1); + float* outptr2 = top_blob.channel(p + 2); + float* outptr3 = top_blob.channel(p + 3); + + const float zeros[4] = {0.f}; + const float* biasptr = bias ? bias + p : zeros; + + int i = 0; + for (; i + 11 < size; i += 12) + { + const float* tmpptr = tmp.channel(i / 12); + const float* kptr0 = kernel.channel(p / 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128i _bias = __lsx_vld(biasptr, 0); + __m128 _sum0 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum1 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum2 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum3 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum4 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum5 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum6 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum7 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum8 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum9 = (__m128)__lsx_vreplvei_w(_bias, 3); + __m128 _suma = (__m128)__lsx_vreplvei_w(_bias, 3); + __m128 _sumb = (__m128)__lsx_vreplvei_w(_bias, 3); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 48); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0); + __m128 _val2 = (__m128)__lsx_vld(tmpptr + 8, 0); + __m128i _w0123 = __lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val0, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val1, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val2, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val0, _sum3); + _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val1, _sum4); + _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val2, _sum5); + _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val0, _sum6); + _sum7 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val1, _sum7); + _sum8 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val2, _sum8); + _sum9 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val0, _sum9); + _suma = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val1, _suma); + _sumb = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val2, _sumb); + + tmpptr += 12; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 8, 0); + __lsx_vst(_sum3, outptr1, 0); + __lsx_vst(_sum4, outptr1 + 4, 0); + __lsx_vst(_sum5, outptr1 + 8, 0); + __lsx_vst(_sum6, outptr2, 0); + __lsx_vst(_sum7, outptr2 + 4, 0); + __lsx_vst(_sum8, outptr2 + 8, 0); + __lsx_vst(_sum9, outptr3, 0); + __lsx_vst(_suma, outptr3 + 4, 0); + __lsx_vst(_sumb, outptr3 + 8, 0); + + outptr0 += 12; + outptr1 += 12; + outptr2 += 12; + outptr3 += 12; + } + for (; i + 7 < size; i += 8) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + const float* kptr0 = kernel.channel(p / 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128i _bias = __lsx_vld(biasptr, 0); + __m128 _sum0 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum1 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum2 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum3 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum4 = (__m128)__lsx_vreplvei_w(_bias, 
2); + __m128 _sum5 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum6 = (__m128)__lsx_vreplvei_w(_bias, 3); + __m128 _sum7 = (__m128)__lsx_vreplvei_w(_bias, 3); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0); + __m128i _w0123 = __lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val0, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val1, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val0, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val1, _sum3); + _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val0, _sum4); + _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val1, _sum5); + _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val0, _sum6); + _sum7 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val1, _sum7); + + tmpptr += 8; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr1, 0); + __lsx_vst(_sum3, outptr1 + 4, 0); + __lsx_vst(_sum4, outptr2, 0); + __lsx_vst(_sum5, outptr2 + 4, 0); + __lsx_vst(_sum6, outptr3, 0); + __lsx_vst(_sum7, outptr3 + 4, 0); + + outptr0 += 8; + outptr1 += 8; + outptr2 += 8; + outptr3 += 8; + } + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + const float* kptr0 = kernel.channel(p / 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128i _bias = __lsx_vld(biasptr, 0); + __m128 _sum0 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum1 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum2 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum3 = (__m128)__lsx_vreplvei_w(_bias, 3); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128i _w0123 = __lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val0, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val0, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val0, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val0, _sum3); + + tmpptr += 4; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + __lsx_vst(_sum2, outptr2, 0); + __lsx_vst(_sum3, outptr3, 0); + + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4); + const float* kptr0 = kernel.channel(p / 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum = (__m128)__lsx_vld(biasptr, 0); + float* _sum_p = (float*)&_sum; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 4); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*tmpptr++); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + + kptr0 += 4; + } + + outptr0[0] = _sum_p[0]; + outptr1[0] = _sum_p[1]; + outptr2[0] = _sum_p[2]; + outptr3[0] = _sum_p[3]; + + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + float* outptr0 = top_blob.channel(p); + + const float bias0 = bias ? 
bias[p] : 0.f; + + int i = 0; + for (; i + 11 < size; i += 12) + { + const float* tmpptr = tmp.channel(i / 12); + const float* kptr0 = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(bias0); + __m128 _sum1 = __lsx_vreplfr2vr_s(bias0); + __m128 _sum2 = __lsx_vreplfr2vr_s(bias0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 48); + __builtin_prefetch(kptr0 + 4); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0); + __m128 _val2 = (__m128)__lsx_vld(tmpptr + 8, 0); + __m128 _w0 = __lsx_vreplfr2vr_s(*kptr0); + _sum0 = __lsx_vfmadd_s(_val0, _w0, _sum0); + _sum1 = __lsx_vfmadd_s(_val1, _w0, _sum1); + _sum2 = __lsx_vfmadd_s(_val2, _w0, _sum2); + + tmpptr += 12; + kptr0 += 1; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 8, 0); + + outptr0 += 12; + } + for (; i + 7 < size; i += 8) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + const float* kptr0 = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(bias0); + __m128 _sum1 = __lsx_vreplfr2vr_s(bias0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr0 + 4); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0); + __m128 _w0 = __lsx_vreplfr2vr_s(*kptr0); + _sum0 = __lsx_vfmadd_s(_val0, _w0, _sum0); + _sum1 = __lsx_vfmadd_s(_val1, _w0, _sum1); + + tmpptr += 8; + kptr0 += 1; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + + outptr0 += 8; + } + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + const float* kptr0 = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(bias0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr0 + 4); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _w0 = __lsx_vreplfr2vr_s(*kptr0); + _sum0 = __lsx_vfmadd_s(_val0, _w0, _sum0); + + tmpptr += 4; + kptr0 += 1; + } + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4); + const float* kptr0 = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk; // inch always > 0 + + float sum0 = bias0; + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, _val0, _sum0); + tmpptr += 4; + kptr0 += 4; + } + + sum0 += __lsx_reduce_fadd_s(_sum0); + + outptr0[0] = sum0; + + outptr0 += 1; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = pb-pa-maxk-inch/pa-outch/pb + Mat kernel = _kernel.reshape(maxk, inch, outch); + kernel_tm.create(4 * 4 * maxk, inch / 4, outch / 4 + outch % 4); + + int q = 0; + for (; q + 3 < outch; q += 4) + { + float* g00 = kernel_tm.channel(q / 4); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; 
i++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = kernel.channel(q + j).row(p + i); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + for (; q < outch; q++) + { + const Mat k0 = kernel.channel(q); + + float* g00 = kernel_tm.channel(q / 4 + q % 4); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = k0.row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } +} + +static void convolution_im2col_sgemm_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator); + { + const int gap = (w * stride_h - outw * stride_w) * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + float* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const float* sptr = img.row(dilation_h * u) + dilation_w * v * 4; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __lsx_vst(_val, ptr, 0); + + sptr += stride_w * 4; + ptr += 4; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack4to1_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack8to1_int8.h b/src/layer/loongarch/convolution_sgemm_pack8to1_int8.h new file mode 100644 index 000000000000..98d11a574b0e --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack8to1_int8.h @@ -0,0 +1,458 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
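[editor's note] The int8 kernels in this file and the next widen their 8-bit operands before multiplying: __lsx_vslti_b(x, 0) builds the sign mask that __lsx_vilvl_b/__lsx_vilvh_b interleave in (sign extension to 16 bits), __lsx_vmul_h multiplies the 16-bit lanes, and __lsx_vhaddw_w_h folds adjacent products into 32-bit accumulators. A scalar sketch of one such step, with hypothetical names and no intrinsics, not part of the patch:

#include <cstdint>

// one widen-multiply-accumulate step over eight int8 pairs: each product has
// magnitude at most 128 * 128 = 16384, so it fits in int16; adjacent pairs are
// then summed into four int32 lanes, mirroring __lsx_vmul_h + __lsx_vhaddw_w_h.
static void int8_dot_step_ref(const int8_t val[8], const int8_t w[8], int32_t acc[4])
{
    for (int i = 0; i < 4; i++)
    {
        int16_t p0 = (int16_t)((int16_t)val[2 * i] * (int16_t)w[2 * i]);
        int16_t p1 = (int16_t)((int16_t)val[2 * i + 1] * (int16_t)w[2 * i + 1]);
        acc[i] += (int32_t)p0 + (int32_t)p1;
    }
}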
+ +static void im2col_sgemm_pack8to1_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + int64_t* tmpptr = tmp.channel(i / 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + __m128i _v = __lsx_vld(img0, 0); + __lsx_vst(_v, tmpptr, 0); + tmpptr += 2; + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + int64_t* tmpptr = tmp.channel(i / 2 + i % 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr += 1; + img0 += size; + } + } + } + } + + int nn_outch = 0; + int remain_outch_start = 0; + + nn_outch = outch >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + int* outptr2 = top_blob.channel(p + 2); + int* outptr3 = top_blob.channel(p + 3); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 64); + __builtin_prefetch(kptr + 128); + __m128i _val01 = __lsx_vld(tmpptr, 0); + __m128i _extval01 = __lsx_vslti_b(_val01, 0); + __m128i _val0 = __lsx_vilvl_b(_extval01, _val01); + __m128i _val1 = __lsx_vilvh_b(_extval01, _val01); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s00 = __lsx_vmul_h(_val0, _w0); + __m128i _s01 = __lsx_vmul_h(_val0, _w1); + __m128i _s02 = __lsx_vmul_h(_val0, _w2); + __m128i _s03 = __lsx_vmul_h(_val0, _w3); + __m128i _s10 = __lsx_vmul_h(_val1, _w0); + __m128i _s11 = __lsx_vmul_h(_val1, _w1); + __m128i _s12 = __lsx_vmul_h(_val1, _w2); + __m128i _s13 = __lsx_vmul_h(_val1, _w3); + + _sum00 = __lsx_vadd_w(_sum00, __lsx_vhaddw_w_h(_s00, _s00)); + _sum01 = __lsx_vadd_w(_sum01, __lsx_vhaddw_w_h(_s01, _s01)); + _sum02 = __lsx_vadd_w(_sum02, 
__lsx_vhaddw_w_h(_s02, _s02)); + _sum03 = __lsx_vadd_w(_sum03, __lsx_vhaddw_w_h(_s03, _s03)); + _sum10 = __lsx_vadd_w(_sum10, __lsx_vhaddw_w_h(_s10, _s10)); + _sum11 = __lsx_vadd_w(_sum11, __lsx_vhaddw_w_h(_s11, _s11)); + _sum12 = __lsx_vadd_w(_sum12, __lsx_vhaddw_w_h(_s12, _s12)); + _sum13 = __lsx_vadd_w(_sum13, __lsx_vhaddw_w_h(_s13, _s13)); + + tmpptr += 16; + kptr += 32; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = __lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + + int sum[8]; + __lsx_vst(_sum00, sum, 0); + __lsx_vst(_sum10, sum + 4, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0[1] = sum[4]; + outptr1[1] = sum[5]; + outptr2[1] = sum[6]; + outptr3[1] = sum[7]; + outptr0 += 2; + outptr1 += 2; + outptr2 += 2; + outptr3 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr + 128); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + __m128i _s2 = __lsx_vmul_h(_val16, _w2); + __m128i _s3 = __lsx_vmul_h(_val16, _w3); + + _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0)); + _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1)); + _sum2 = __lsx_vadd_w(_sum2, __lsx_vhaddw_w_h(_s2, _s2)); + _sum3 = __lsx_vadd_w(_sum3, __lsx_vhaddw_w_h(_s3, _s3)); + + tmpptr += 8; + kptr += 32; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + _sum0 = 
__lsx_vadd_w(_sum0, _sum2); + + int sum[4]; + __lsx_vst(_sum0, sum, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + + remain_outch_start += nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 64); + __builtin_prefetch(kptr + 32); + __m128i _val01 = __lsx_vld(tmpptr, 0); + __m128i _extval01 = __lsx_vslti_b(_val01, 0); + __m128i _val0 = __lsx_vilvl_b(_extval01, _val01); + __m128i _val1 = __lsx_vilvh_b(_extval01, _val01); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val0, _w16); + __m128i _s1 = __lsx_vmul_h(_val1, _w16); + + _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0)); + _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1)); + + tmpptr += 16; + kptr += 8; + } + + outptr0[0] = __lsx_reduce_add_w(_sum0); + outptr0[1] = __lsx_reduce_add_w(_sum1); + outptr0 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr + 32); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + + _sum = __lsx_vadd_w(_sum, __lsx_vhaddw_w_h(_s0, _s0)); + + tmpptr += 8; + kptr += 8; + } + + outptr0[0] = __lsx_reduce_add_w(_sum); + outptr0 += 1; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 8a-4b-maxk-inch/8a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); + if (outch >= 4) + kernel_tm.create(32 * maxk, inch / 8, outch / 4 + outch % 4, (size_t)1u); + else + kernel_tm.create(8 * maxk, inch / 8, outch, (size_t)1u); + + int q = 0; + for (; q + 3 < outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 8; j++) + { + const signed char* k00 = kernel.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + // TODO unroll 2 + for (; q < outch; q++) + { + signed char* g00 = kernel_tm.channel(q / 4 + q % 4); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + for (int j = 0; j < 8; j++) + { + const signed char* k00 = kernel.channel(q).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } +} + +static void convolution_im2col_sgemm_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int 
dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + int64_t* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const int64_t* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack8to1_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack8to4_int8.h b/src/layer/loongarch/convolution_sgemm_pack8to4_int8.h new file mode 100644 index 000000000000..ae9090c95606 --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack8to4_int8.h @@ -0,0 +1,324 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
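[editor's note] Both int8 sgemm files finish a tile the same way: each of the four 32-bit accumulators holds four partial sums belonging to one output channel, so the code transposes the 4x4 block with __lsx_vilvl_w/__lsx_vilvh_w/__lsx_vilvl_d/__lsx_vilvh_d and then adds the rows, turning four horizontal reductions into three vector adds. A scalar equivalent with hypothetical names, for illustration only:

#include <cstdint>

// acc[r][*] holds four partial sums of output channel r; out[r] is their total.
// Transposing first lets the per-channel totals be produced by plain element-wise
// adds, which is what the vilv* + __lsx_vadd_w sequences compute.
static void reduce_4x4_accumulators_ref(const int32_t acc[4][4], int32_t out[4])
{
    int32_t t[4][4];
    for (int r = 0; r < 4; r++)
        for (int c = 0; c < 4; c++)
            t[c][r] = acc[r][c]; // transpose

    for (int r = 0; r < 4; r++)
        out[r] = t[0][r] + t[1][r] + t[2][r] + t[3][r]; // row-wise adds = channel totals
}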
+ +static void im2col_sgemm_pack8to4_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = size >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + int64_t* tmpptr = tmp.channel(i / 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + __m128i _v = __lsx_vld(img0, 0); + __lsx_vst(_v, tmpptr, 0); + tmpptr += 2; + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + int64_t* tmpptr = tmp.channel(i / 2 + i % 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr += 1; + img0 += size; + } + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 64); + __builtin_prefetch(kptr + 128); + __m128i _val01 = __lsx_vld(tmpptr, 0); + __m128i _extval01 = __lsx_vslti_b(_val01, 0); + __m128i _val0 = __lsx_vilvl_b(_extval01, _val01); + __m128i _val1 = __lsx_vilvh_b(_extval01, _val01); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s00 = __lsx_vmul_h(_val0, _w0); + __m128i _s01 = __lsx_vmul_h(_val0, _w1); + __m128i _s02 = __lsx_vmul_h(_val0, _w2); + __m128i _s03 = __lsx_vmul_h(_val0, _w3); + __m128i _s10 = __lsx_vmul_h(_val1, _w0); + __m128i _s11 = __lsx_vmul_h(_val1, _w1); + __m128i _s12 = __lsx_vmul_h(_val1, _w2); + __m128i _s13 = __lsx_vmul_h(_val1, _w3); + + _sum00 = __lsx_vadd_w(_sum00, __lsx_vhaddw_w_h(_s00, _s00)); + _sum01 = __lsx_vadd_w(_sum01, __lsx_vhaddw_w_h(_s01, _s01)); + _sum02 = __lsx_vadd_w(_sum02, __lsx_vhaddw_w_h(_s02, _s02)); + _sum03 = __lsx_vadd_w(_sum03, __lsx_vhaddw_w_h(_s03, _s03)); + _sum10 = __lsx_vadd_w(_sum10, __lsx_vhaddw_w_h(_s10, _s10)); + _sum11 = __lsx_vadd_w(_sum11, __lsx_vhaddw_w_h(_s11, _s11)); + _sum12 = __lsx_vadd_w(_sum12, __lsx_vhaddw_w_h(_s12, 
_s12)); + _sum13 = __lsx_vadd_w(_sum13, __lsx_vhaddw_w_h(_s13, _s13)); + + tmpptr += 16; + kptr += 32; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = __lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum10, outptr0 + 4, 0); + outptr0 += 8; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr + 128); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + __m128i _s2 = __lsx_vmul_h(_val16, _w2); + __m128i _s3 = __lsx_vmul_h(_val16, _w3); + + _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0)); + _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1)); + _sum2 = __lsx_vadd_w(_sum2, __lsx_vhaddw_w_h(_s2, _s2)); + _sum3 = __lsx_vadd_w(_sum3, __lsx_vhaddw_w_h(_s3, _s3)); + + tmpptr += 8; + kptr += 32; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + _sum0 = __lsx_vadd_w(_sum0, _sum2); + + __lsx_vst(_sum0, outptr0, 0); + outptr0 += 4; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 8a-4b-maxk-inch/8a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); + kernel_tm.create(32 * maxk, inch / 8, outch / 4, (size_t)1u); + + for (int q = 0; q + 3 < 
outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 8; j++) + { + const signed char* k00 = kernel.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } +} + +static void convolution_im2col_sgemm_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + int64_t* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const int64_t* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack8to4_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_winograd_dot.h b/src/layer/loongarch/convolution_winograd_dot.h new file mode 100644 index 000000000000..9dbbe4955490 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot.h @@ -0,0 +1,495 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
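[editor's note] convolution_winograd_dot_lsx below is the middle stage of the Winograd pipeline: after the input and kernel transforms, each of the 16/36/64 transform points is an independent small matrix product of the transformed kernel (outch x inch) with the transformed input (inch x tiles). The scalar sketch below states that contraction with a hypothetical flat layout and is not part of the patch; the real code additionally repacks tiles in groups of four and tiles the output channels 8/4/1 (or 2/1 without LSX):

#include <cstddef>

// bottom_tm: [inch][batch][tiles], kernel_tm: [outch][batch][inch],
// top_tm: [outch][batch][tiles]; batch is the number of transform points.
static void winograd_dot_ref(const float* bottom_tm, const float* kernel_tm,
                             float* top_tm, int inch, int outch, int batch, int tiles)
{
    for (int p = 0; p < outch; p++)
        for (int b = 0; b < batch; b++)
            for (int t = 0; t < tiles; t++)
            {
                float sum = 0.f;
                for (int q = 0; q < inch; q++)
                    sum += bottom_tm[((size_t)q * batch + b) * tiles + t]
                           * kernel_tm[((size_t)p * batch + b) * inch + q];
                top_tm[((size_t)p * batch + b) * tiles + t] = sum;
            }
}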
+ +static void convolution_winograd_dot_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 4u, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; + if (tiles >= 4) + bottom_blob_tm2.create(4 * inch, tiles / 4 + tiles % 4, batch, 4u, opt.workspace_allocator); + else + bottom_blob_tm2.create(1 * inch, tiles, batch, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 3 < tiles; i += 4) + { + float* tmpptr = tm2.row(i / 4); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i); + + for (int q = 0; q < inch; q++) + { +#if __loongarch_sx + __lsx_vst(__lsx_vld(r0, 0), tmpptr, 0); +#else + tmpptr[0] = r0[0]; + tmpptr[1] = r0[1]; + tmpptr[2] = r0[2]; + tmpptr[3] = r0[3]; +#endif + + r0 += bottom_blob_tm.cstep; + tmpptr += 4; + } + } + for (; i < tiles; i++) + { + float* tmpptr = tm2.row(i / 4 + i % 4); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i); + + for (int q = 0; q < inch; q++) + { + tmpptr[0] = r0[0]; + + r0 += bottom_blob_tm.cstep; + tmpptr += 1; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 4u, opt.workspace_allocator); + +#if __loongarch_sx + int nn_outch = outch >> 3; + int remain_outch_start = nn_outch << 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 8; + + float* output0_tm = top_blob_tm.channel(p); + float* output1_tm = top_blob_tm.channel(p + 1); + float* output2_tm = top_blob_tm.channel(p + 2); + float* output3_tm = top_blob_tm.channel(p + 3); + float* output4_tm = top_blob_tm.channel(p + 4); + float* output5_tm = top_blob_tm.channel(p + 5); + float* output6_tm = top_blob_tm.channel(p + 6); + float* output7_tm = top_blob_tm.channel(p + 7); + + const Mat kernel0_tm = kernel_tm.channel(p / 8); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum4 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum5 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum6 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum7 = (__m128)__lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 32); + __m128 _val = (__m128)__lsx_vld(r0, 0); + __m128i _w0123 = __lsx_vld(k0, 0); + __m128i _w4567 = __lsx_vld(k0 + 4, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3); + _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 0), _val, _sum4); + _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 1), _val, _sum5); + _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 2), _val, _sum6); + _sum7 
= __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 3), _val, _sum7); + + r0 += 4; + k0 += 8; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output1_tm, 0); + __lsx_vst(_sum2, output2_tm, 0); + __lsx_vst(_sum3, output3_tm, 0); + __lsx_vst(_sum4, output4_tm, 0); + __lsx_vst(_sum5, output5_tm, 0); + __lsx_vst(_sum6, output6_tm, 0); + __lsx_vst(_sum7, output7_tm, 0); + + output0_tm += 4; + output1_tm += 4; + output2_tm += 4; + output3_tm += 4; + output4_tm += 4; + output5_tm += 4; + output6_tm += 4; + output7_tm += 4; + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 4 + i % 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + float sum4 = 0.f; + float sum5 = 0.f; + float sum6 = 0.f; + float sum7 = 0.f; + + int j = 0; + for (; j < nn; j++) + { + sum0 += r0[0] * k0[0]; + sum1 += r0[0] * k0[1]; + sum2 += r0[0] * k0[2]; + sum3 += r0[0] * k0[3]; + sum4 += r0[0] * k0[4]; + sum5 += r0[0] * k0[5]; + sum6 += r0[0] * k0[6]; + sum7 += r0[0] * k0[7]; + + r0 += 1; + k0 += 8; + } + + output0_tm[0] = sum0; + output1_tm[0] = sum1; + output2_tm[0] = sum2; + output3_tm[0] = sum3; + output4_tm[0] = sum4; + output5_tm[0] = sum5; + output6_tm[0] = sum6; + output7_tm[0] = sum7; + + output0_tm++; + output1_tm++; + output2_tm++; + output3_tm++; + output4_tm++; + output5_tm++; + output6_tm++; + output7_tm++; + } + } + } + + nn_outch = (outch - remain_outch_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = remain_outch_start + pp * 4; + + float* output0_tm = top_blob_tm.channel(p); + float* output1_tm = top_blob_tm.channel(p + 1); + float* output2_tm = top_blob_tm.channel(p + 2); + float* output3_tm = top_blob_tm.channel(p + 3); + + const Mat kernel0_tm = kernel_tm.channel(p / 8 + (p % 8) / 4); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 16); + __m128 _val = (__m128)__lsx_vld(r0, 0); + __m128i _w0123 = __lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3); + + r0 += 4; + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output1_tm, 0); + __lsx_vst(_sum2, output2_tm, 0); + __lsx_vst(_sum3, output3_tm, 0); + + output0_tm += 4; + output1_tm += 4; + output2_tm += 4; + output3_tm += 4; + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 4 + i % 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + + int j = 0; + for (; j < nn; j++) + { + sum0 += r0[0] * k0[0]; + sum1 += r0[0] * k0[1]; + sum2 += r0[0] * k0[2]; + sum3 += r0[0] * k0[3]; + + r0 += 1; + k0 += 4; + } + + output0_tm[0] = sum0; + output1_tm[0] = sum1; + 
output2_tm[0] = sum2; + output3_tm[0] = sum3; + + output0_tm++; + output1_tm++; + output2_tm++; + output3_tm++; + } + } + } + + remain_outch_start += nn_outch << 2; +#else + int nn_outch = outch >> 1; + int remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + float* output0_tm = top_blob_tm.channel(p); + float* output1_tm = top_blob_tm.channel(p + 1); + + const Mat kernel0_tm = kernel_tm.channel(p / 2); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum00 = 0.f; + float sum01 = 0.f; + float sum02 = 0.f; + float sum03 = 0.f; + float sum10 = 0.f; + float sum11 = 0.f; + float sum12 = 0.f; + float sum13 = 0.f; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 8); + float w0 = k0[0]; + float w1 = k0[1]; + sum00 += r0[0] * w0; + sum01 += r0[1] * w0; + sum02 += r0[2] * w0; + sum03 += r0[3] * w0; + sum10 += r0[0] * w1; + sum11 += r0[1] * w1; + sum12 += r0[2] * w1; + sum13 += r0[3] * w1; + + r0 += 4; + k0 += 2; + } + + output0_tm[0] = sum00; + output0_tm[1] = sum01; + output0_tm[2] = sum02; + output0_tm[3] = sum03; + output1_tm[0] = sum10; + output1_tm[1] = sum11; + output1_tm[2] = sum12; + output1_tm[3] = sum13; + + output0_tm += 4; + output1_tm += 4; + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 4 + i % 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum00 = 0.f; + float sum10 = 0.f; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 4); + __builtin_prefetch(k0 + 8); + float val0 = r0[0]; + sum00 += val0 * k0[0]; + sum10 += val0 * k0[1]; + + r0 += 1; + k0 += 2; + } + + output0_tm[0] = sum00; + output1_tm[0] = sum10; + output0_tm++; + output1_tm++; + } + } + } +#endif + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + float* output0_tm = top_blob_tm.channel(p); + +#if __loongarch_sx + const Mat kernel0_tm = kernel_tm.channel(p / 8 + (p % 8) / 4 + p % 4); +#else + const Mat kernel0_tm = kernel_tm.channel(p / 2 + p % 2); +#endif + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + int j = 0; +#if __loongarch_sx + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + + for (; j < nn; j++) + { + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vld(r0, 0), __lsx_vreplfr2vr_s(k0[0]), _sum0); + r0 += 4; + k0++; + } + + __lsx_vst(_sum0, output0_tm, 0); + output0_tm += 4; +#else // __loongarch_sx + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + + for (; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 4); + float w0 = k0[0]; + sum0 += r0[0] * w0; + sum1 += r0[1] * w0; + sum2 += r0[2] * w0; + sum3 += r0[3] * w0; + + r0 += 4; + k0++; + } + + output0_tm[0] = sum0; + output0_tm[1] = sum1; + output0_tm[2] = sum2; + output0_tm[3] = sum3; + output0_tm += 4; +#endif // __loongarch_sx + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 4 + i % 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum = 0.f; + + for (int j = 0; j < nn; 
j++) + { + float w0 = k0[0]; + float val0 = r0[0]; + sum += val0 * w0; + + r0 += 1; + k0 += 1; + } + + output0_tm[0] = sum; + output0_tm += 1; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_dot_int8.h b/src/layer/loongarch/convolution_winograd_dot_int8.h new file mode 100644 index 000000000000..2ae5ce4f55eb --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot_int8.h @@ -0,0 +1,594 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_winograd_dot_int8_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 2u, 1, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; +#if __loongarch_sx + if (inch >= 4) + { + if (tiles >= 2) + bottom_blob_tm2.create(inch / 4 + inch % 4, tiles / 2 + tiles % 2, batch, 16u, 8, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(inch / 4 + inch % 4, tiles, batch, 8u, 4, opt.workspace_allocator); + } + else +#endif // __loongarch_sx + { + if (tiles >= 2) + bottom_blob_tm2.create(inch, tiles / 2 + tiles % 2, batch, 4u, 2, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(inch, tiles, batch, 2u, 1, opt.workspace_allocator); + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 1 < tiles; i += 2) + { + short* tmpptr = tm2.row(i / 2); + + const short* r0 = (const short*)bottom_blob_tm + r * tiles + i; + + int q = 0; +#if __loongarch_sx + const short* r1 = (const short*)bottom_blob_tm.channel(1) + r * tiles + i; + const short* r2 = (const short*)bottom_blob_tm.channel(2) + r * tiles + i; + const short* r3 = (const short*)bottom_blob_tm.channel(3) + r * tiles + i; + for (; q + 3 < inch; q += 4) + { + tmpptr[0] = r0[0]; + tmpptr[1] = r1[0]; + tmpptr[2] = r2[0]; + tmpptr[3] = r3[0]; + tmpptr[4] = r0[1]; + tmpptr[5] = r1[1]; + tmpptr[6] = r2[1]; + tmpptr[7] = r3[1]; + r0 += bottom_blob_tm.cstep * 4; + r1 += bottom_blob_tm.cstep * 4; + r2 += bottom_blob_tm.cstep * 4; + r3 += bottom_blob_tm.cstep * 4; + tmpptr += 8; + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + tmpptr[0] = r0[0]; + tmpptr[1] = r0[1]; + r0 += bottom_blob_tm.cstep; + tmpptr += 2; + } + } + for (; i < tiles; i++) + { + short* tmpptr = tm2.row(i / 2 + i % 2); + + const short* r0 = (const short*)bottom_blob_tm + r * tiles + i; + + int q = 0; +#if __loongarch_sx + const short* r1 = (const short*)bottom_blob_tm.channel(1) + r * tiles + i; + const short* r2 = (const short*)bottom_blob_tm.channel(2) + r * tiles + i; + const short* r3 = (const short*)bottom_blob_tm.channel(3) + r * tiles + i; + for (; q + 3 < inch; q += 4) + { + 
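+                    // gather one transformed coefficient from each of the 4 consecutive
+                    // input channels, so the int8 dot loop below can fetch all four of
+                    // them with a single __lsx_vld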
tmpptr[0] = r0[0]; + tmpptr[1] = r1[0]; + tmpptr[2] = r2[0]; + tmpptr[3] = r3[0]; + r0 += bottom_blob_tm.cstep * 4; + r1 += bottom_blob_tm.cstep * 4; + r2 += bottom_blob_tm.cstep * 4; + r3 += bottom_blob_tm.cstep * 4; + tmpptr += 4; + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + tmpptr[0] = r0[0]; + r0 += bottom_blob_tm.cstep; + tmpptr += 1; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 4u, 1, opt.workspace_allocator); + +#if __loongarch_sx + int nn_outch = outch >> 2; + int remain_outch_start = nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* output0_tm = top_blob_tm.channel(p); + int* output1_tm = top_blob_tm.channel(p + 1); + int* output2_tm = top_blob_tm.channel(p + 2); + int* output3_tm = top_blob_tm.channel(p + 3); + + const Mat kernel0_tm = kernel_tm.channel(p / 4); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int nn4 = inch / 4; + int nn1 = inch % 4; + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val01 = __lsx_vld(r0, 0); + + __m128i _val0 = __lsx_vilvl_d(_val01, _val01); + __m128i _val1 = __lsx_vilvh_d(_val01, _val01); + + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + + __m128i _extval0 = __lsx_vslti_h(_val0, 0); + __m128i _extval1 = __lsx_vslti_h(_val1, 0); + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + + __m128i _val0l = __lsx_vilvl_h(_extval0, _val0); + __m128i _val0h = __lsx_vilvh_h(_extval0, _val0); + __m128i _val1l = __lsx_vilvl_h(_extval1, _val1); + __m128i _val1h = __lsx_vilvh_h(_extval1, _val1); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + + _sum00 = __lsx_vmadd_w(_sum00, _val0l, _w0l); + _sum01 = __lsx_vmadd_w(_sum01, _val0h, _w0h); + _sum02 = __lsx_vmadd_w(_sum02, _val0l, _w1l); + _sum03 = __lsx_vmadd_w(_sum03, _val0h, _w1h); + _sum10 = __lsx_vmadd_w(_sum10, _val1l, _w0l); + _sum11 = __lsx_vmadd_w(_sum11, _val1h, _w0h); + _sum12 = __lsx_vmadd_w(_sum12, _val1l, _w1l); + _sum13 = __lsx_vmadd_w(_sum13, _val1h, _w1h); + + r0 += 8; + k0 += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = __lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); 
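+                    // regrouped this way, the __lsx_vadd_w folds below leave each lane of
+                    // _sum00 / _sum10 holding the complete int32 dot product for one of the
+                    // four output channels (for tile i and tile i+1 respectively)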
+ } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + } + + for (int j = 0; j < nn1; j++) + { + __m128i _val0 = __lsx_vreplgr2vr_h(r0[0]); + __m128i _val1 = __lsx_vreplgr2vr_h(r0[1]); + __m128i _val = __lsx_vilvl_d(_val1, _val0); + + __m128i _w16 = __lsx_vld(k0, 0); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _extval = __lsx_vslti_h(_val, 0); + __m128i _extw16 = __lsx_vslti_h(_w16, 0); + + __m128i _vall = __lsx_vilvl_h(_extval, _val); + __m128i _valh = __lsx_vilvh_h(_extval, _val); + __m128i _w0l = __lsx_vilvl_h(_extw16, _w16); + __m128i _w0h = __lsx_vilvh_h(_extw16, _w16); + + _sum00 = __lsx_vmadd_w(_sum00, _vall, _w0l); + _sum10 = __lsx_vmadd_w(_sum10, _valh, _w0h); + + r0 += 2; + k0 += 4; + } + + int sum[8]; + __lsx_vst(_sum00, sum, 0); + __lsx_vst(_sum10, sum + 4, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; + output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm[1] = sum[4]; + output1_tm[1] = sum[5]; + output2_tm[1] = sum[6]; + output3_tm[1] = sum[7]; + output0_tm += 2; + output1_tm += 2; + output2_tm += 2; + output3_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int nn4 = inch / 4; + int nn1 = inch % 4; + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val16 = __lsx_vld(r0, 0); + + _val16 = __lsx_vilvl_d(_val16, _val16); + + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + + __m128i _extval16 = __lsx_vslti_h(_val16, 0); + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + + __m128i _val0l = __lsx_vilvl_h(_extval16, _val16); + __m128i _val0h = __lsx_vilvh_h(_extval16, _val16); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + + _sum0 = __lsx_vmadd_w(_sum0, _val0l, _w0l); + _sum1 = __lsx_vmadd_w(_sum1, _val0h, _w0h); + _sum2 = __lsx_vmadd_w(_sum2, _val0l, _w1l); + _sum3 = __lsx_vmadd_w(_sum3, _val0h, _w1h); + + r0 += 4; + k0 += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + _sum0 = __lsx_vadd_w(_sum0, _sum2); + } + + for (int j = 0; j < nn1; j++) + { + __m128i _val = __lsx_vreplgr2vr_w(r0[0]); + __m128i _w16 = __lsx_vld(k0, 0); + + __m128i _extw16 = __lsx_vslti_h(_w16, 0); + __m128i _w0l = __lsx_vilvl_h(_extw16, _w16); + + _sum0 = __lsx_vmadd_w(_sum0, _val, _w0l); + + r0 += 1; + k0 += 4; + } + + int sum[4]; + __lsx_vst(_sum0, sum, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; + output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm += 1; + output1_tm += 1; + output2_tm += 1; + output3_tm += 1; + } + } + } +#else // __loongarch_sx + int nn_outch = outch >> 1; + int 
remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + int* output0_tm = top_blob_tm.channel(p); + int* output1_tm = top_blob_tm.channel(p + 1); + + const Mat kernel0_tm = kernel_tm.channel(p / 2); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int sum00 = 0; + int sum01 = 0; + int sum10 = 0; + int sum11 = 0; + + int nn1 = inch; + + for (int j = 0; j < nn1; j++) + { + signed short val0 = r0[0]; + signed short val1 = r0[1]; + signed short w0 = k0[0]; + signed short w1 = k0[1]; + + sum00 += val0 * w0; + sum01 += val1 * w0; + sum10 += val0 * w1; + sum11 += val1 * w1; + + r0 += 2; + k0 += 2; + } + + output0_tm[0] = sum00; + output0_tm[1] = sum01; + output1_tm[0] = sum10; + output1_tm[1] = sum11; + output0_tm += 2; + output1_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int sum0 = 0; + int sum1 = 0; + + int nn1 = inch; + + for (int j = 0; j < nn1; j++) + { + signed short val0 = r0[0]; + signed short w0 = k0[0]; + signed short w1 = k0[1]; + + sum0 += val0 * w0; + sum1 += val0 * w1; + + r0 += 1; + k0 += 2; + } + + output0_tm[0] = sum0; + output1_tm[0] = sum1; + output0_tm += 1; + output1_tm += 1; + } + } + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* output0_tm = top_blob_tm.channel(p); + +#if __loongarch_sx + const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4); +#else + const Mat kernel0_tm = kernel_tm.channel(p / 2 + p % 2); +#endif + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int sum0 = 0; + int sum1 = 0; + +#if __loongarch_sx + int nn4 = inch / 4; + int nn1 = inch % 4; + + if (nn4 > 0) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val16 = __lsx_vld(r0, 0); + + __m128i _w16 = __lsx_vld(k0, 0); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _extval16 = __lsx_vslti_h(_val16, 0); + __m128i _extw16 = __lsx_vslti_h(_w16, 0); + + __m128i _val0l = __lsx_vilvl_h(_extval16, _val16); + __m128i _val0h = __lsx_vilvh_h(_extval16, _val16); + + __m128i _w0l = __lsx_vilvl_h(_extw16, _w16); + __m128i _w0h = __lsx_vilvh_h(_extw16, _w16); + + _sum0 = __lsx_vmadd_w(_sum0, _val0l, _w0l); + _sum1 = __lsx_vmadd_w(_sum1, _val0h, _w0h); + + r0 += 8; + k0 += 4; + } + + sum0 = __lsx_reduce_add_w(_sum0); + sum1 = __lsx_reduce_add_w(_sum1); + } +#else // __loongarch_sx + int nn1 = inch; +#endif // __loongarch_sx + + for (int q = 0; q < nn1; q++) + { + signed short val0 = r0[0]; + signed short val1 = r0[1]; + signed short w = k0[0]; + + sum0 += val0 * w; + sum1 += val1 * w; + + k0 += 1; + r0 += 2; + } + + output0_tm[0] = sum0; + output0_tm[1] = sum1; + output0_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int sum = 0; + +#if __loongarch_sx + int nn4 = inch / 4; + int nn1 = inch % 4; + + if (nn4 > 0) + { + __m128i _sum = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val16 = __lsx_vld(r0, 0); + 
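+                        // _val16 holds the 4-channel int16 coefficients of both tiles
+                        // (tile i in the low half, tile i+1 in the high half); the
+                        // vslti_h / vilvl_h pairs below sign-extend values and weights
+                        // to int32 before the multiply-accumulate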
__m128i _w16 = __lsx_vld(k0, 0); + + __m128i _extval16 = __lsx_vslti_h(_val16, 0); + __m128i _extw16 = __lsx_vslti_h(_w16, 0); + + __m128i _val0l = __lsx_vilvl_h(_extval16, _val16); + __m128i _w0l = __lsx_vilvl_h(_extw16, _w16); + + _sum = __lsx_vmadd_w(_sum, _val0l, _w0l); + + r0 += 4; + k0 += 4; + } + + sum = __lsx_reduce_add_w(_sum); + } +#else // __loongarch_sx + int nn1 = inch; +#endif // __loongarch_sx + + for (int q = 0; q < nn1; q++) + { + signed short val = r0[0]; + signed short w = k0[0]; + + sum += val * w; + + k0 += 1; + r0 += 1; + } + + output0_tm[0] = sum; + output0_tm++; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_dot_pack4.h b/src/layer/loongarch/convolution_winograd_dot_pack4.h new file mode 100644 index 000000000000..66002a62a625 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot_pack4.h @@ -0,0 +1,448 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_winograd_dot_pack4_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 16u, 4, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; + if (tiles >= 12) + bottom_blob_tm2.create(12 * inch, tiles / 12 + (tiles % 12) / 8 + (tiles % 12 % 8) / 4 + (tiles % 12 % 4) / 2 + tiles % 12 % 2, batch, 16u, 4, opt.workspace_allocator); + else if (tiles >= 8) + bottom_blob_tm2.create(8 * inch, tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2, batch, 16u, 4, opt.workspace_allocator); + else if (tiles >= 4) + bottom_blob_tm2.create(4 * inch, tiles / 4 + (tiles % 4) / 2 + tiles % 2, batch, 16u, 4, opt.workspace_allocator); + else if (tiles >= 2) + bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, batch, 16u, 4, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(1 * inch, tiles, batch, 16u, 4, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 11 < tiles; i += 12) + { + float* tmpptr = tm2.row(i / 12); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + // transpose 4x8 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(r0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(r0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(r0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(r0 + 4 * 7, 0); + __m128i _r8 = __lsx_vld(r0 + 4 * 8, 0); + __m128i _r9 = __lsx_vld(r0 + 4 * 9, 0); + __m128i _ra = __lsx_vld(r0 + 4 * 10, 0); + __m128i _rb = __lsx_vld(r0 + 4 * 11, 0); + + __m128i _r01r = 
__lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r89r = __lsx_vilvl_w(_r9, _r8); + __m128i _r89l = __lsx_vilvh_w(_r9, _r8); + __m128i _rabr = __lsx_vilvl_w(_rb, _ra); + __m128i _rabl = __lsx_vilvh_w(_rb, _ra); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + __m128i _r89ab_0 = __lsx_vilvl_d(_rabr, _r89r); + __m128i _r89ab_1 = __lsx_vilvh_d(_rabr, _r89r); + __m128i _r89ab_2 = __lsx_vilvl_d(_rabl, _r89l); + __m128i _r89ab_3 = __lsx_vilvh_d(_rabl, _r89l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r89ab_0, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 4, 0); + __lsx_vst(_r89ab_1, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 7, 0); + __lsx_vst(_r89ab_2, tmpptr + 4 * 8, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 9, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 10, 0); + __lsx_vst(_r89ab_3, tmpptr + 4 * 11, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 48; + } + } + for (; i + 7 < tiles; i += 8) + { + float* tmpptr = tm2.row(i / 12 + (i % 12) / 8); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + // transpose 4x8 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(r0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(r0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(r0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(r0 + 4 * 7, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 2, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 4, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 7, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 32; + } + } + for (; i + 3 < tiles; i += 4) + { + float* tmpptr = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + // transpose 4x4 
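+                    // gather the same pack-4 lane of the 4 adjacent tiles into one vector;
+                    // after this permute the dot loop further down reads one group of
+                    // 4 tile values per __lsx_vld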
+ __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r0123_1, tmpptr + 4, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 3, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 16; + } + } + for (; i + 1 < tiles; i += 2) + { + float* tmpptr = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + // transpose 4x2 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + + __m128i _r01_0 = __lsx_vilvl_w(_r1, _r0); + __m128i _r01_1 = __lsx_vilvh_w(_r1, _r0); + + __lsx_vst(_r01_0, tmpptr, 0); + __lsx_vst(_r01_1, tmpptr + 4, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 8; + } + } + for (; i < tiles; i++) + { + float* tmpptr = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + __m128i _val = __lsx_vld(r0, 0); + __lsx_vst(_val, tmpptr, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 4; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 16u, 4, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* output0_tm = top_blob_tm.channel(p); + + const Mat kernel0_tm = kernel_tm.channel(p); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 11 < tiles; i += 12) + { + const float* r0 = bb2.row(i / 12); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum4 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum5 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum6 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum7 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum8 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum9 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _suma = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sumb = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 48); + __builtin_prefetch(k0 + 16); + __m128i _val0123 = __lsx_vld(r0, 0); + __m128i _val4567 = __lsx_vld(r0 + 4, 0); + __m128i _val89ab = __lsx_vld(r0 + 8, 0); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4); + _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5); + _sum6 = 
__lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6); + _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7); + _sum8 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 0), _sum8); + _sum9 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 1), _sum9); + _suma = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 2), _suma); + _sumb = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 3), _sumb); + + r0 += 12; + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output0_tm + 4, 0); + __lsx_vst(_sum2, output0_tm + 4 * 2, 0); + __lsx_vst(_sum3, output0_tm + 4 * 3, 0); + __lsx_vst(_sum4, output0_tm + 4 * 4, 0); + __lsx_vst(_sum5, output0_tm + 4 * 5, 0); + __lsx_vst(_sum6, output0_tm + 4 * 6, 0); + __lsx_vst(_sum7, output0_tm + 4 * 7, 0); + __lsx_vst(_sum8, output0_tm + 4 * 8, 0); + __lsx_vst(_sum9, output0_tm + 4 * 9, 0); + __lsx_vst(_suma, output0_tm + 4 * 10, 0); + __lsx_vst(_sumb, output0_tm + 4 * 11, 0); + + output0_tm += 4 * 12; + } + for (; i + 7 < tiles; i += 8) + { + const float* r0 = bb2.row(i / 12 + (i % 12) / 8); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum4 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum5 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum6 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum7 = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 16); + __m128i _val0123 = __lsx_vld(r0, 0); + __m128i _val4567 = __lsx_vld(r0 + 4, 0); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4); + _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5); + _sum6 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6); + _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7); + + r0 += 8; + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output0_tm + 4, 0); + __lsx_vst(_sum2, output0_tm + 4 * 2, 0); + __lsx_vst(_sum3, output0_tm + 4 * 3, 0); + __lsx_vst(_sum4, output0_tm + 4 * 4, 0); + __lsx_vst(_sum5, output0_tm + 4 * 5, 0); + __lsx_vst(_sum6, output0_tm + 4 * 6, 0); + __lsx_vst(_sum7, output0_tm + 4 * 7, 0); + + output0_tm += 4 * 8; + } + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 16); + __m128i _val0123 = __lsx_vld(r0, 0); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, 
(__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + + r0 += 4; + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output0_tm + 4, 0); + __lsx_vst(_sum2, output0_tm + 4 * 2, 0); + __lsx_vst(_sum3, output0_tm + 4 * 3, 0); + + output0_tm += 4 * 4; + } + for (; i + 1 < tiles; i += 2) + { + const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 8); + __builtin_prefetch(k0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*r0++); + __m128 _val1 = __lsx_vreplfr2vr_s(*r0++); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s(_w0, _val0, _sum0); + _sum1 = __lsx_vfmadd_s(_w0, _val1, _sum1); + + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output0_tm + 4, 0); + + output0_tm += 4 * 2; + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 4); + __builtin_prefetch(k0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*r0++); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + + k0 += 4; + } + + __lsx_vst(_sum, output0_tm, 0); + + output0_tm += 4; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_dot_pack8to1_int8.h b/src/layer/loongarch/convolution_winograd_dot_pack8to1_int8.h new file mode 100644 index 000000000000..f87aa9ef558a --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot_pack8to1_int8.h @@ -0,0 +1,363 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
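+
+// Dot-product stage of the int8 Winograd path for pack8 input / pack1 output.
+// bottom_blob_tm holds the transformed input as int16 (elempack 8), kernel_tm the
+// transformed weights as int16, and every element of top_blob_tm is an int32 dot
+// product over inch * 8 terms.  As a rough scalar sketch of what the vectorized
+// loops below compute (the in()/w()/out() indexing is only notation for this note,
+// not real accessors):
+//
+//   for each output channel p, transform position r, tile i:
+//       int sum = 0;
+//       for (int q = 0; q < inch; q++)
+//           for (int k = 0; k < 8; k++)
+//               sum += (int)in(q, r, i, k) * (int)w(p, r, q, k);
+//       out(p, r, i) = sum;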
+ +static void convolution_winograd_dot_pack8to1_int8_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 16u, 8, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; + if (tiles >= 2) + bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, batch, 16u, 8, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(1 * inch, tiles, batch, 16u, 8, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 1 < tiles; i += 2) + { + short* tmpptr = tm2.row(i / 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 8, 0); + __lsx_vst(_r0, tmpptr, 0); + __lsx_vst(_r1, tmpptr + 8, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 16; + } + } + for (; i < tiles; i++) + { + short* tmpptr = tm2.row(i / 2 + i % 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + __m128i _r0 = __lsx_vld(r0, 0); + __lsx_vst(_r0, tmpptr, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 8; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 4u, 1, opt.workspace_allocator); + + int nn_outch = 0; + int remain_outch_start = 0; + + nn_outch = outch >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* output0_tm = top_blob_tm.channel(p); + int* output1_tm = top_blob_tm.channel(p + 1); + int* output2_tm = top_blob_tm.channel(p + 2); + int* output3_tm = top_blob_tm.channel(p + 3); + + const Mat kernel0_tm = kernel_tm.channel(p / 4); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 64); + __builtin_prefetch(k0 + 128); + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + __m128i _w2 = __lsx_vld(k0 + 16, 0); + __m128i _w3 = __lsx_vld(k0 + 24, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + __m128i _extw2 = __lsx_vslti_h(_w2, 0); + __m128i _extw3 = __lsx_vslti_h(_w3, 0); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + __m128i _w2l = __lsx_vilvl_h(_extw2, _w2); + __m128i _w2h = __lsx_vilvh_h(_extw2, _w2); + __m128i _w3l = __lsx_vilvl_h(_extw3, _w3); + __m128i _w3h = __lsx_vilvh_h(_extw3, _w3); + + __m128i _val0_0 = __lsx_vreplgr2vr_w(r0[0]); + __m128i _val0_1 = __lsx_vreplgr2vr_w(r0[1]); + __m128i _val0_2 = __lsx_vreplgr2vr_w(r0[2]); + __m128i _val0_3 = __lsx_vreplgr2vr_w(r0[3]); + __m128i _val0_4 = __lsx_vreplgr2vr_w(r0[4]); + __m128i _val0_5 = __lsx_vreplgr2vr_w(r0[5]); + __m128i _val0_6 = __lsx_vreplgr2vr_w(r0[6]); + __m128i 
_val0_7 = __lsx_vreplgr2vr_w(r0[7]); + __m128i _val1_0 = __lsx_vreplgr2vr_w(r0[8]); + __m128i _val1_1 = __lsx_vreplgr2vr_w(r0[9]); + __m128i _val1_2 = __lsx_vreplgr2vr_w(r0[10]); + __m128i _val1_3 = __lsx_vreplgr2vr_w(r0[11]); + __m128i _val1_4 = __lsx_vreplgr2vr_w(r0[12]); + __m128i _val1_5 = __lsx_vreplgr2vr_w(r0[13]); + __m128i _val1_6 = __lsx_vreplgr2vr_w(r0[14]); + __m128i _val1_7 = __lsx_vreplgr2vr_w(r0[15]); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0_0); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val0_1); + _sum2 = __lsx_vmadd_w(_sum2, _w0l, _val1_0); + _sum3 = __lsx_vmadd_w(_sum3, _w0h, _val1_1); + _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val0_2); + _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val0_3); + _sum2 = __lsx_vmadd_w(_sum2, _w1l, _val1_2); + _sum3 = __lsx_vmadd_w(_sum3, _w1h, _val1_3); + _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val0_4); + _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val0_5); + _sum2 = __lsx_vmadd_w(_sum2, _w2l, _val1_4); + _sum3 = __lsx_vmadd_w(_sum3, _w2h, _val1_5); + _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val0_6); + _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val0_7); + _sum2 = __lsx_vmadd_w(_sum2, _w3l, _val1_6); + _sum3 = __lsx_vmadd_w(_sum3, _w3h, _val1_7); + + r0 += 16; + k0 += 32; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + int sum[8]; + __lsx_vst(_sum0, sum, 0); + __lsx_vst(_sum2, sum + 4, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; + output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm[1] = sum[4]; + output1_tm[1] = sum[5]; + output2_tm[1] = sum[6]; + output3_tm[1] = sum[7]; + output0_tm += 2; + output1_tm += 2; + output2_tm += 2; + output3_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 128); + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + __m128i _w2 = __lsx_vld(k0 + 16, 0); + __m128i _w3 = __lsx_vld(k0 + 24, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + __m128i _extw2 = __lsx_vslti_h(_w2, 0); + __m128i _extw3 = __lsx_vslti_h(_w3, 0); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + __m128i _w2l = __lsx_vilvl_h(_extw2, _w2); + __m128i _w2h = __lsx_vilvh_h(_extw2, _w2); + __m128i _w3l = __lsx_vilvl_h(_extw3, _w3); + __m128i _w3h = __lsx_vilvh_h(_extw3, _w3); + + __m128i _val0 = __lsx_vreplgr2vr_w(r0[0]); + __m128i _val1 = __lsx_vreplgr2vr_w(r0[1]); + __m128i _val2 = __lsx_vreplgr2vr_w(r0[2]); + __m128i _val3 = __lsx_vreplgr2vr_w(r0[3]); + __m128i _val4 = __lsx_vreplgr2vr_w(r0[4]); + __m128i _val5 = __lsx_vreplgr2vr_w(r0[5]); + __m128i _val6 = __lsx_vreplgr2vr_w(r0[6]); + __m128i _val7 = __lsx_vreplgr2vr_w(r0[7]); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val1); + _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val2); + _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val3); + _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val4); + _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val5); + _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val6); + _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val7); + + r0 += 8; + k0 += 32; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + + int sum[4]; + __lsx_vst(_sum0, sum, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; 
+ output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm += 1; + output1_tm += 1; + output2_tm += 1; + output3_tm += 1; + } + } + } + + remain_outch_start += nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* output0_tm = top_blob_tm.channel(p); + + const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + for (int q = 0; q < inch; q++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 64); + __m128i _val0 = __lsx_vld(r0, 0); + __m128i _val1 = __lsx_vld(r0 + 8, 0); + + __m128i _extval0 = __lsx_vslti_h(_val0, 0); + __m128i _extval1 = __lsx_vslti_h(_val1, 0); + __m128i _val0l = __lsx_vilvl_h(_extval0, _val0); + __m128i _val0h = __lsx_vilvh_h(_extval0, _val0); + __m128i _val1l = __lsx_vilvl_h(_extval1, _val1); + __m128i _val1h = __lsx_vilvh_h(_extval1, _val1); + + __m128i _w0 = __lsx_vld(k0, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0l); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val0h); + _sum2 = __lsx_vmadd_w(_sum2, _w0l, _val1l); + _sum3 = __lsx_vmadd_w(_sum3, _w0h, _val1h); + + k0 += 8; + r0 += 16; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + output0_tm[0] = __lsx_reduce_add_w(_sum0); + output0_tm[1] = __lsx_reduce_add_w(_sum2); + output0_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + for (int q = 0; q < inch; q++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 32); + __m128i _val = __lsx_vld(r0, 0); + + __m128i _extval = __lsx_vslti_h(_val, 0); + __m128i _vall = __lsx_vilvl_h(_extval, _val); + __m128i _valh = __lsx_vilvh_h(_extval, _val); + + __m128i _w0 = __lsx_vld(k0, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _vall); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _valh); + + k0 += 8; + r0 += 8; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + + output0_tm[0] = __lsx_reduce_add_w(_sum0); + output0_tm++; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_dot_pack8to4_int8.h b/src/layer/loongarch/convolution_winograd_dot_pack8to4_int8.h new file mode 100644 index 000000000000..c20400cbf8c3 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot_pack8to4_int8.h @@ -0,0 +1,233 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_winograd_dot_pack8to4_int8_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 16u, 8, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; + if (tiles >= 2) + bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, batch, 16u, 8, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(1 * inch, tiles, batch, 16u, 8, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 1 < tiles; i += 2) + { + short* tmpptr = tm2.row(i / 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 8, 0); + __lsx_vst(_r0, tmpptr, 0); + __lsx_vst(_r1, tmpptr + 8, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 16; + } + } + for (; i < tiles; i++) + { + short* tmpptr = tm2.row(i / 2 + i % 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + __m128i _r0 = __lsx_vld(r0, 0); + __lsx_vst(_r0, tmpptr, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 8; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 16u, 4, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* output0_tm = top_blob_tm.channel(p); + + const Mat kernel0_tm = kernel_tm.channel(p); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 64); + __builtin_prefetch(k0 + 128); + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + __m128i _w2 = __lsx_vld(k0 + 16, 0); + __m128i _w3 = __lsx_vld(k0 + 24, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + __m128i _extw2 = __lsx_vslti_h(_w2, 0); + __m128i _extw3 = __lsx_vslti_h(_w3, 0); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + __m128i _w2l = __lsx_vilvl_h(_extw2, _w2); + __m128i _w2h = __lsx_vilvh_h(_extw2, _w2); + __m128i _w3l = __lsx_vilvl_h(_extw3, _w3); + __m128i _w3h = __lsx_vilvh_h(_extw3, _w3); + + __m128i _val0_0 = __lsx_vreplgr2vr_w(r0[0]); + __m128i _val0_1 = __lsx_vreplgr2vr_w(r0[1]); + __m128i _val0_2 = __lsx_vreplgr2vr_w(r0[2]); + __m128i _val0_3 = 
__lsx_vreplgr2vr_w(r0[3]); + __m128i _val0_4 = __lsx_vreplgr2vr_w(r0[4]); + __m128i _val0_5 = __lsx_vreplgr2vr_w(r0[5]); + __m128i _val0_6 = __lsx_vreplgr2vr_w(r0[6]); + __m128i _val0_7 = __lsx_vreplgr2vr_w(r0[7]); + __m128i _val1_0 = __lsx_vreplgr2vr_w(r0[8]); + __m128i _val1_1 = __lsx_vreplgr2vr_w(r0[9]); + __m128i _val1_2 = __lsx_vreplgr2vr_w(r0[10]); + __m128i _val1_3 = __lsx_vreplgr2vr_w(r0[11]); + __m128i _val1_4 = __lsx_vreplgr2vr_w(r0[12]); + __m128i _val1_5 = __lsx_vreplgr2vr_w(r0[13]); + __m128i _val1_6 = __lsx_vreplgr2vr_w(r0[14]); + __m128i _val1_7 = __lsx_vreplgr2vr_w(r0[15]); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0_0); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val0_1); + _sum2 = __lsx_vmadd_w(_sum2, _w0l, _val1_0); + _sum3 = __lsx_vmadd_w(_sum3, _w0h, _val1_1); + _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val0_2); + _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val0_3); + _sum2 = __lsx_vmadd_w(_sum2, _w1l, _val1_2); + _sum3 = __lsx_vmadd_w(_sum3, _w1h, _val1_3); + _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val0_4); + _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val0_5); + _sum2 = __lsx_vmadd_w(_sum2, _w2l, _val1_4); + _sum3 = __lsx_vmadd_w(_sum3, _w2h, _val1_5); + _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val0_6); + _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val0_7); + _sum2 = __lsx_vmadd_w(_sum2, _w3l, _val1_6); + _sum3 = __lsx_vmadd_w(_sum3, _w3h, _val1_7); + + r0 += 16; + k0 += 32; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum2, output0_tm + 4, 0); + + output0_tm += 8; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 128); + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + __m128i _w2 = __lsx_vld(k0 + 16, 0); + __m128i _w3 = __lsx_vld(k0 + 24, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + __m128i _extw2 = __lsx_vslti_h(_w2, 0); + __m128i _extw3 = __lsx_vslti_h(_w3, 0); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + __m128i _w2l = __lsx_vilvl_h(_extw2, _w2); + __m128i _w2h = __lsx_vilvh_h(_extw2, _w2); + __m128i _w3l = __lsx_vilvl_h(_extw3, _w3); + __m128i _w3h = __lsx_vilvh_h(_extw3, _w3); + + __m128i _val0 = __lsx_vreplgr2vr_w(r0[0]); + __m128i _val1 = __lsx_vreplgr2vr_w(r0[1]); + __m128i _val2 = __lsx_vreplgr2vr_w(r0[2]); + __m128i _val3 = __lsx_vreplgr2vr_w(r0[3]); + __m128i _val4 = __lsx_vreplgr2vr_w(r0[4]); + __m128i _val5 = __lsx_vreplgr2vr_w(r0[5]); + __m128i _val6 = __lsx_vreplgr2vr_w(r0[6]); + __m128i _val7 = __lsx_vreplgr2vr_w(r0[7]); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val1); + _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val2); + _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val3); + _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val4); + _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val5); + _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val6); + _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val7); + + r0 += 8; + k0 += 32; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + + __lsx_vst(_sum0, output0_tm, 0); + output0_tm += 4; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform.h 
b/src/layer/loongarch/convolution_winograd_transform.h new file mode 100644 index 000000000000..624600e95a0d --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform.h @@ -0,0 +1,405 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_winograd43_transform_input_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 4; + const int h_tiles = (h - 2) / 4; + const int tiles = w_tiles * h_tiles; + + // const float itm[6][6] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[6][6]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 4) + (j * 4); + + for (int m = 0; m < 6; m++) + { + float r00 = r0[0]; + float r01 = r0[1]; + float r02 = r0[2]; + float r03 = r0[3]; + float r04 = r0[4]; + float r05 = r0[5]; + + float tmp0m = 4 * r00 - 5 * r02 + r04; + float tmp1m = -4 * (r01 + r02) + r04 + r03; + float tmp2m = 4 * (r01 - r02) + r04 - r03; + float tmp3m = -2 * (r01 - r03) + r04 - r02; + float tmp4m = 2 * (r01 - r03) + r04 - r02; + float tmp5m = 4 * r01 - 5 * r03 + r05; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + tmp[2][m] = tmp2m; + tmp[3][m] = tmp3m; + tmp[4][m] = tmp4m; + tmp[5][m] = tmp5m; + + r0 += w; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j); + float* r0_tm_1 = r0_tm_0 + tiles; + float* r0_tm_2 = r0_tm_0 + tiles * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 3; + float* r0_tm_4 = r0_tm_0 + tiles * 4; + float* r0_tm_5 = r0_tm_0 + tiles * 5; + + for (int m = 0; m < 6; m++) + { + float tmp00 = tmp[m][0]; + float tmp01 = tmp[m][1]; + float tmp02 = tmp[m][2]; + float tmp03 = tmp[m][3]; + float tmp04 = tmp[m][4]; + float tmp05 = tmp[m][5]; + + float r0tm0 = 4 * tmp00 - 5 * tmp02 + tmp04; + float r0tm1 = -4 * (tmp01 + tmp02) + tmp04 + tmp03; + float r0tm2 = 4 * (tmp01 - tmp02) + tmp04 - tmp03; + float r0tm3 = -2 * (tmp01 - tmp03) + tmp04 - tmp02; + float r0tm4 = 2 * (tmp01 - tmp03) + tmp04 - tmp02; + float r0tm5 = 4 * tmp01 - 5 * tmp03 + tmp05; + + r0_tm_0[0] = r0tm0; + r0_tm_1[0] = r0tm1; + r0_tm_2[0] = r0tm2; + r0_tm_3[0] = r0tm3; + r0_tm_4[0] = r0tm4; + r0_tm_5[0] = r0tm5; + 
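+                    // the six pointers cover six consecutive transform positions of this
+                    // tile (spaced 'tiles' floats apart), so advancing each of them by
+                    // tiles * 6 moves on to the next group of six in the 6x6 block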
+ r0_tm_0 += tiles * 6; + r0_tm_1 += tiles * 6; + r0_tm_2 += tiles * 6; + r0_tm_3 += tiles * 6; + r0_tm_4 += tiles * 6; + r0_tm_5 += tiles * 6; + } + } + } + } +} + +static void conv3x3s1_winograd43_transform_output_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 4; + const int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + float bias0 = biasptr ? biasptr[p] : 0.f; + + float tmp[4][6]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j); + const float* output0_tm_1 = output0_tm_0 + tiles; + const float* output0_tm_2 = output0_tm_0 + tiles * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 3; + const float* output0_tm_4 = output0_tm_0 + tiles * 4; + const float* output0_tm_5 = output0_tm_0 + tiles * 5; + + float* output0 = out0.row(i * 4) + (j * 4); + + for (int m = 0; m < 6; m++) + { + float out0tm0 = output0_tm_0[0]; + float out0tm1 = output0_tm_1[0]; + float out0tm2 = output0_tm_2[0]; + float out0tm3 = output0_tm_3[0]; + float out0tm4 = output0_tm_4[0]; + float out0tm5 = output0_tm_5[0]; + + float tmp02a = out0tm1 + out0tm2; + float tmp13a = out0tm1 - out0tm2; + + float tmp02b = out0tm3 + out0tm4; + float tmp13b = out0tm3 - out0tm4; + + float tmp0m = out0tm0 + tmp02a + tmp02b; + float tmp1m = tmp13a + tmp13b * 2; + float tmp2m = tmp02a + tmp02b * 4; + float tmp3m = out0tm5 + tmp13a + tmp13b * 8; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + tmp[2][m] = tmp2m; + tmp[3][m] = tmp3m; + + output0_tm_0 += tiles * 6; + output0_tm_1 += tiles * 6; + output0_tm_2 += tiles * 6; + output0_tm_3 += tiles * 6; + output0_tm_4 += tiles * 6; + output0_tm_5 += tiles * 6; + } + + for (int m = 0; m < 4; m++) + { + float tmp00 = tmp[m][0]; + float tmp01 = tmp[m][1]; + float tmp02 = tmp[m][2]; + float tmp03 = tmp[m][3]; + float tmp04 = tmp[m][4]; + float tmp05 = tmp[m][5]; + + float tmp02a = tmp01 + tmp02; + float tmp13a = tmp01 - tmp02; + + float tmp02b = tmp03 + tmp04; + float tmp13b = tmp03 - tmp04; + + float out00 = bias0 + tmp00 + tmp02a + tmp02b; + float out01 = bias0 + tmp13a + tmp13b * 2; + float out02 = bias0 + tmp02a + tmp02b * 4; + float out03 = bias0 + tmp05 + tmp13a + tmp13b * 8; + + output0[0] = out00; + output0[1] = out01; + output0[2] = out02; + output0[3] = out03; + + output0 += outw; + } + } + } + } +} + +static void conv3x3s1_winograd23_transform_input_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 2; + const int h_tiles = (h - 2) / 2; + const int tiles = w_tiles * h_tiles; + + // const float itm[4][4] = { + // {1.0f, 0.0f, -1.0f, 0.0f}, + // {0.0f, 1.0f, 1.00f, 0.0f}, + 
// {0.0f, -1.0f, 1.00f, 0.0f}, + // {0.0f, -1.0f, 0.00f, 1.0f} + // }; + + // 0 = r00 - r02 + // 1 = r01 + r02 + // 2 = r02 - r01 + // 3 = r03 - r01 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[4][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 2) + (j * 2); + + for (int m = 0; m < 4; m++) + { + float r00 = r0[0]; + float r01 = r0[1]; + float r02 = r0[2]; + float r03 = r0[3]; + + float tmp0m = r00 - r02; + float tmp1m = r01 + r02; + float tmp2m = r02 - r01; + float tmp3m = r03 - r01; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + tmp[2][m] = tmp2m; + tmp[3][m] = tmp3m; + + r0 += w; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j); + float* r0_tm_1 = r0_tm_0 + tiles; + float* r0_tm_2 = r0_tm_0 + tiles * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 3; + + for (int m = 0; m < 4; m++) + { + float tmp00 = tmp[m][0]; + float tmp01 = tmp[m][1]; + float tmp02 = tmp[m][2]; + float tmp03 = tmp[m][3]; + + float r0tm0 = tmp00 - tmp02; + float r0tm1 = tmp01 + tmp02; + float r0tm2 = tmp02 - tmp01; + float r0tm3 = tmp03 - tmp01; + + r0_tm_0[0] = r0tm0; + r0_tm_1[0] = r0tm1; + r0_tm_2[0] = r0tm2; + r0_tm_3[0] = r0tm3; + + r0_tm_0 += tiles * 4; + r0_tm_1 += tiles * 4; + r0_tm_2 += tiles * 4; + r0_tm_3 += tiles * 4; + } + } + } + } +} + +static void conv3x3s1_winograd23_transform_output_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 2; + const int h_tiles = outh / 2; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[2][4] = { + // {1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 1.0f} + // }; + + // 0 = r00 + r01 + r02 + // 1 = r01 - r02 + r03 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + float bias0 = biasptr ? 
biasptr[p] : 0.f; + + float tmp[2][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j); + const float* output0_tm_1 = output0_tm_0 + tiles; + const float* output0_tm_2 = output0_tm_0 + tiles * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 3; + + float* output0 = out0.row(i * 2) + (j * 2); + + for (int m = 0; m < 4; m++) + { + float out0tm0 = output0_tm_0[0]; + float out0tm1 = output0_tm_1[0]; + float out0tm2 = output0_tm_2[0]; + float out0tm3 = output0_tm_3[0]; + + float tmp0m = out0tm0 + out0tm1 + out0tm2; + float tmp1m = out0tm1 - out0tm2 + out0tm3; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + + output0_tm_0 += tiles * 4; + output0_tm_1 += tiles * 4; + output0_tm_2 += tiles * 4; + output0_tm_3 += tiles * 4; + } + + for (int m = 0; m < 2; m++) + { + float tmp00 = tmp[m][0]; + float tmp01 = tmp[m][1]; + float tmp02 = tmp[m][2]; + float tmp03 = tmp[m][3]; + + float out00 = bias0 + tmp00 + tmp01 + tmp02; + float out01 = bias0 + tmp01 - tmp02 + tmp03; + + output0[0] = out00; + output0[1] = out01; + + output0 += outw; + } + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform_int8.h b/src/layer/loongarch/convolution_winograd_transform_int8.h new file mode 100644 index 000000000000..09ef669e4733 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform_int8.h @@ -0,0 +1,229 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
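
The winograd23 pair that closes the float transform header above implements F(2,3): B^T folds each 4x4 input tile into the transform domain, the per-tile multiply-accumulate over input channels happens elsewhere in this patch, and A^T brings the 4x4 result back to a 2x2 output tile. Below is a minimal scalar 1-D sketch of that identity, assuming the textbook F(2,3) kernel transform G (rows 1, 1/2*(1,1,1), 1/2*(1,-1,1), 1); that G is an illustration-only assumption, since the patch's own kernel transform lives in the convolution source, not in this header.

#include <cassert>
#include <cmath>

int main()
{
    const float d[4] = {1.f, 2.f, 3.f, 4.f}; // one input row of a tile
    const float g[3] = {0.5f, -1.f, 2.f};    // 3-tap kernel

    // reference: direct 3-tap correlation, two outputs
    const float y_ref[2] = {
        d[0] * g[0] + d[1] * g[1] + d[2] * g[2],
        d[1] * g[0] + d[2] * g[1] + d[3] * g[2],
    };

    // input transform, same formulas as the "0 = r00 - r02 ..." comments
    const float u[4] = {d[0] - d[2], d[1] + d[2], d[2] - d[1], d[3] - d[1]};

    // kernel transform G * g (assumed textbook matrix, see note above)
    const float w[4] = {g[0], (g[0] + g[1] + g[2]) * 0.5f, (g[0] - g[1] + g[2]) * 0.5f, g[2]};

    // element-wise product, then output transform
    // ("0 = r00 + r01 + r02", "1 = r01 - r02 + r03")
    float m[4];
    for (int i = 0; i < 4; i++)
        m[i] = u[i] * w[i];
    const float y[2] = {m[0] + m[1] + m[2], m[1] - m[2] + m[3]};

    assert(std::fabs(y[0] - y_ref[0]) < 1e-5f);
    assert(std::fabs(y[1] - y_ref[1]) < 1e-5f);
    return 0;
}

Applying the same identity once along rows and once along columns is exactly what the 4x4 tmp staging buffer in the two functions above does.
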
+ +static void conv3x3s1_winograd43_transform_input_int8_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 4; + const int h_tiles = (h - 2) / 4; + const int tiles = w_tiles * h_tiles; + + // const float itm[6][6] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + short tmp[6][6]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const signed char* r0 = img0.row(i * 4) + (j * 4); + + for (int m = 0; m < 6; m++) + { + signed char r00 = r0[0]; + signed char r01 = r0[1]; + signed char r02 = r0[2]; + signed char r03 = r0[3]; + signed char r04 = r0[4]; + signed char r05 = r0[5]; + + short tmp0m = 4 * r00 - 5 * r02 + r04; + short tmp1m = -4 * (r01 + r02) + r04 + r03; + short tmp2m = 4 * (r01 - r02) + r04 - r03; + short tmp3m = -2 * (r01 - r03) + r04 - r02; + short tmp4m = 2 * (r01 - r03) + r04 - r02; + short tmp5m = 4 * r01 - 5 * r03 + r05; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + tmp[2][m] = tmp2m; + tmp[3][m] = tmp3m; + tmp[4][m] = tmp4m; + tmp[5][m] = tmp5m; + + r0 += w; + } + + short* r0_tm_0 = (short*)img0_tm + (i * w_tiles + j); + short* r0_tm_1 = r0_tm_0 + tiles; + short* r0_tm_2 = r0_tm_0 + tiles * 2; + short* r0_tm_3 = r0_tm_0 + tiles * 3; + short* r0_tm_4 = r0_tm_0 + tiles * 4; + short* r0_tm_5 = r0_tm_0 + tiles * 5; + + for (int m = 0; m < 6; m++) + { + short tmp00 = tmp[m][0]; + short tmp01 = tmp[m][1]; + short tmp02 = tmp[m][2]; + short tmp03 = tmp[m][3]; + short tmp04 = tmp[m][4]; + short tmp05 = tmp[m][5]; + + short r0tm0 = 4 * tmp00 - 5 * tmp02 + tmp04; + short r0tm1 = -4 * (tmp01 + tmp02) + tmp04 + tmp03; + short r0tm2 = 4 * (tmp01 - tmp02) + tmp04 - tmp03; + short r0tm3 = -2 * (tmp01 - tmp03) + tmp04 - tmp02; + short r0tm4 = 2 * (tmp01 - tmp03) + tmp04 - tmp02; + short r0tm5 = 4 * tmp01 - 5 * tmp03 + tmp05; + + r0_tm_0[0] = r0tm0; + r0_tm_1[0] = r0tm1; + r0_tm_2[0] = r0tm2; + r0_tm_3[0] = r0tm3; + r0_tm_4[0] = r0tm4; + r0_tm_5[0] = r0tm5; + + r0_tm_0 += tiles * 6; + r0_tm_1 += tiles * 6; + r0_tm_2 += tiles * 6; + r0_tm_3 += tiles * 6; + r0_tm_4 += tiles * 6; + r0_tm_5 += tiles * 6; + } + } + } + } +} + +static void conv3x3s1_winograd43_transform_output_int8_lsx(const Mat& top_blob_tm, Mat& top_blob, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 4; + const int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - 
r02) + (r03 - r04) * 8 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + int tmp[4][6]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const int* output0_tm_0 = (const int*)out0_tm + (i * w_tiles + j) * 1; + const int* output0_tm_1 = output0_tm_0 + tiles * 1; + const int* output0_tm_2 = output0_tm_0 + tiles * 2; + const int* output0_tm_3 = output0_tm_0 + tiles * 3; + const int* output0_tm_4 = output0_tm_0 + tiles * 4; + const int* output0_tm_5 = output0_tm_0 + tiles * 5; + + int* output0 = out0.row(i * 4) + j * 4; + + for (int m = 0; m < 5; m++) + { + int tmp02a = output0_tm_1[0] + output0_tm_2[0]; + int tmp13a = output0_tm_1[0] - output0_tm_2[0]; + + int tmp02b = output0_tm_3[0] + output0_tm_4[0]; + int tmp13b = output0_tm_3[0] - output0_tm_4[0]; + + tmp[0][m] = output0_tm_0[0] + tmp02a + tmp02b; + tmp[1][m] = tmp13a + tmp13b * 2; + tmp[2][m] = tmp02a + tmp02b * 4; + tmp[3][m] = output0_tm_5[0] * 4 + tmp13a + tmp13b * 8; + + output0_tm_0 += tiles * 6; + output0_tm_1 += tiles * 6; + output0_tm_2 += tiles * 6; + output0_tm_3 += tiles * 6; + output0_tm_4 += tiles * 6; + output0_tm_5 += tiles * 6; + } + for (int m = 5; m < 6; m++) + { + int tmp02a = output0_tm_1[0] + output0_tm_2[0]; + int tmp13a = output0_tm_1[0] - output0_tm_2[0]; + + int tmp02b = output0_tm_3[0] + output0_tm_4[0]; + int tmp13b = output0_tm_3[0] - output0_tm_4[0]; + + tmp[0][m] = (output0_tm_0[0] + tmp02a + tmp02b) * 4; + tmp[1][m] = (tmp13a + tmp13b * 2) * 4; + tmp[2][m] = (tmp02a + tmp02b * 4) * 4; + tmp[3][m] = (output0_tm_5[0] * 4 + tmp13a + tmp13b * 8) * 4; + + output0_tm_0 += tiles * 6; + output0_tm_1 += tiles * 6; + output0_tm_2 += tiles * 6; + output0_tm_3 += tiles * 6; + output0_tm_4 += tiles * 6; + output0_tm_5 += tiles * 6; + } + + for (int m = 0; m < 4; m++) + { + const int* tmp0 = tmp[m]; + + int tmp02a = tmp0[1] + tmp0[2]; + int tmp13a = tmp0[1] - tmp0[2]; + + int tmp02b = tmp0[3] + tmp0[4]; + int tmp13b = tmp0[3] - tmp0[4]; + + output0[0] = (tmp0[0] + tmp02a + tmp02b) / 576; + output0[1] = (tmp13a + tmp13b * 2) / 576; + output0[2] = (tmp02a + tmp02b * 4) / 576; + output0[3] = (tmp0[5] + tmp13a + tmp13b * 8) / 576; + + output0 += outw; + } + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform_pack4.h b/src/layer/loongarch/convolution_winograd_transform_pack4.h new file mode 100644 index 000000000000..3969e59cf09c --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform_pack4.h @@ -0,0 +1,730 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
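
conv3x3s1_winograd43_transform_input_int8_lsx above can stay in 16-bit integer arithmetic because every coefficient of the commented itm[6][6] matrix is a small integer. The sketch below, with arbitrary values, only checks that the fused expressions used in that function ("0 = 4 * r00 - 5 * r02 + r04" and friends) are exactly the rows of the commented matrix applied to a 6-element column.

#include <cassert>

int main()
{
    const signed char r[6] = {3, -7, 12, 0, -5, 9};

    // B^T as written in the itm[6][6] comment above
    const short itm[6][6] = {
        {4, 0, -5, 0, 1, 0},
        {0, -4, -4, 1, 1, 0},
        {0, 4, -4, -1, 1, 0},
        {0, -2, -1, 2, 1, 0},
        {0, 2, -1, -2, 1, 0},
        {0, 4, 0, -5, 0, 1},
    };

    short ref[6];
    for (int i = 0; i < 6; i++)
    {
        int acc = 0;
        for (int k = 0; k < 6; k++)
            acc += itm[i][k] * r[k];
        ref[i] = (short)acc;
    }

    // fused form, as in the input transform above
    short fused[6];
    fused[0] = 4 * r[0] - 5 * r[2] + r[4];
    fused[1] = -4 * (r[1] + r[2]) + r[4] + r[3];
    fused[2] = 4 * (r[1] - r[2]) + r[4] - r[3];
    fused[3] = -2 * (r[1] - r[3]) + r[4] - r[2];
    fused[4] = 2 * (r[1] - r[3]) + r[4] - r[2];
    fused[5] = 4 * r[1] - 5 * r[3] + r[5];

    for (int i = 0; i < 6; i++)
        assert(ref[i] == fused[i]);
    return 0;
}
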
+ +static void conv3x3s1_winograd63_transform_input_pack4_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 6; + const int h_tiles = (h - 2) / 6; + const int tiles = w_tiles * h_tiles; + + // const float itm[8][8] = { + // {1.0f, 0.0f, -5.25f, 0.00f, 5.25f, 0.00f, -1.0f, 0.0f}, + // + // {0.0f, 1.0f, 1.00f, -4.25f, -4.25f, 1.00f, 1.0f, 0.0f}, + // {0.0f, -1.0f, 1.00f, 4.25f, -4.25f, -1.00f, 1.0f, 0.0f}, + // + // {0.0f, 0.5f, 0.25f, -2.50f, -1.25f, 2.00f, 1.0f, 0.0f}, + // {0.0f, -0.5f, 0.25f, 2.50f, -1.25f, -2.00f, 1.0f, 0.0f}, + // + // {0.0f, 2.0f, 4.00f, -2.50f, -5.00f, 0.50f, 1.0f, 0.0f}, + // {0.0f, -2.0f, 4.00f, 2.50f, -5.00f, -0.50f, 1.0f, 0.0f}, + // + // {0.0f, -1.0f, 0.00f, 5.25f, 0.00f, -5.25f, 0.0f, 1.0f} + // }; + + // 0 = r00 - r06 + (r04 - r02) * 5.25 + // 7 = r07 - r01 + (r03 - r05) * 5.25 + + // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05) + // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05) + + // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2) + // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2) + + // reuse r04 * 1.25 + // reuse r03 * 2.5 + // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5) + // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5) + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[8][8][4]; + + __m128 _v5_25 = __lsx_vreplfr2vr_s(5.25f); + __m128 _vm4_25 = __lsx_vreplfr2vr_s(-4.25f); + __m128 _vm1_25 = __lsx_vreplfr2vr_s(-1.25f); + __m128 _v0_25 = __lsx_vreplfr2vr_s(0.25f); + __m128 _vm2_5 = __lsx_vreplfr2vr_s(-2.5f); + __m128 _v0_5 = __lsx_vreplfr2vr_s(0.5f); + __m128 _v2 = __lsx_vreplfr2vr_s(2.f); + __m128 _v4 = __lsx_vreplfr2vr_s(4.f); + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 6) + (j * 6) * 4; + + for (int m = 0; m < 8; m++) + { + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + __m128 _r05 = (__m128)__lsx_vld(r0 + 4 * 5, 0); + __m128 _r06 = (__m128)__lsx_vld(r0 + 4 * 6, 0); + __m128 _r07 = (__m128)__lsx_vld(r0 + 4 * 7, 0); + + __m128 _tmp0m = __lsx_vfmadd_s(__lsx_vfsub_s(_r04, _r02), _v5_25, __lsx_vfsub_s(_r00, _r06)); + __m128 _tmp7m = __lsx_vfmadd_s(__lsx_vfsub_s(_r03, _r05), _v5_25, __lsx_vfsub_s(_r07, _r01)); + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp7m, tmp[7][m], 0); + + __m128 _tmp12a = __lsx_vfmadd_s(_r04, _vm4_25, __lsx_vfadd_s(_r02, _r06)); + __m128 _tmp12b = __lsx_vfmadd_s(_r03, _vm4_25, __lsx_vfadd_s(_r01, _r05)); + + __m128 _tmp1m = __lsx_vfadd_s(_tmp12a, _tmp12b); + __m128 _tmp2m = __lsx_vfsub_s(_tmp12a, _tmp12b); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + + __m128 _tmp34a = __lsx_vfmadd_s(_r04, _vm1_25, __lsx_vfmadd_s(_r02, _v0_25, _r06)); + __m128 _tmp34b = __lsx_vfmadd_s(_r05, _v2, __lsx_vfmadd_s(_r03, _vm2_5, __lsx_vfmul_s(_r01, _v0_5))); + + __m128 _tmp3m = __lsx_vfadd_s(_tmp34a, _tmp34b); + __m128 _tmp4m = __lsx_vfsub_s(_tmp34a, _tmp34b); + __lsx_vst(_tmp3m, tmp[3][m], 0); + __lsx_vst(_tmp4m, tmp[4][m], 0); + + __m128 _tmp56a = 
__lsx_vfmadd_s(__lsx_vfmadd_s(_r04, _vm1_25, _r02), _v4, _r06); + __m128 _tmp56b = __lsx_vfmadd_s(_r05, _v0_5, __lsx_vfmadd_s(_r03, _vm2_5, __lsx_vfmul_s(_r01, _v2))); + + __m128 _tmp5m = __lsx_vfadd_s(_tmp56a, _tmp56b); + __m128 _tmp6m = __lsx_vfsub_s(_tmp56a, _tmp56b); + __lsx_vst(_tmp5m, tmp[5][m], 0); + __lsx_vst(_tmp6m, tmp[6][m], 0); + + r0 += w * 4; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j) * 4; + float* r0_tm_1 = r0_tm_0 + tiles * 4; + float* r0_tm_2 = r0_tm_0 + tiles * 4 * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 4 * 3; + float* r0_tm_4 = r0_tm_0 + tiles * 4 * 4; + float* r0_tm_5 = r0_tm_0 + tiles * 4 * 5; + float* r0_tm_6 = r0_tm_0 + tiles * 4 * 6; + float* r0_tm_7 = r0_tm_0 + tiles * 4 * 7; + + for (int m = 0; m < 8; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0); + __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0); + __m128 _tmp06 = (__m128)__lsx_vld(tmp[m][6], 0); + __m128 _tmp07 = (__m128)__lsx_vld(tmp[m][7], 0); + + __m128 _r0tm0 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp04, _tmp02), _v5_25, __lsx_vfsub_s(_tmp00, _tmp06)); + __m128 _r0tm7 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp03, _tmp05), _v5_25, __lsx_vfsub_s(_tmp07, _tmp01)); + + __m128 _tmp12a = __lsx_vfmadd_s(_tmp04, _vm4_25, __lsx_vfadd_s(_tmp02, _tmp06)); + __m128 _tmp12b = __lsx_vfmadd_s(_tmp03, _vm4_25, __lsx_vfadd_s(_tmp01, _tmp05)); + + __m128 _r0tm1 = __lsx_vfadd_s(_tmp12a, _tmp12b); + __m128 _r0tm2 = __lsx_vfsub_s(_tmp12a, _tmp12b); + + __m128 _tmp34a = __lsx_vfmadd_s(_tmp04, _vm1_25, __lsx_vfmadd_s(_tmp02, _v0_25, _tmp06)); + __m128 _tmp34b = __lsx_vfmadd_s(_tmp05, _v2, __lsx_vfmadd_s(_tmp03, _vm2_5, __lsx_vfmul_s(_tmp01, _v0_5))); + + __m128 _r0tm3 = __lsx_vfadd_s(_tmp34a, _tmp34b); + __m128 _r0tm4 = __lsx_vfsub_s(_tmp34a, _tmp34b); + + __m128 _tmp56a = __lsx_vfmadd_s(__lsx_vfmadd_s(_tmp04, _vm1_25, _tmp02), _v4, _tmp06); + __m128 _tmp56b = __lsx_vfmadd_s(_tmp05, _v0_5, __lsx_vfmadd_s(_tmp03, _vm2_5, __lsx_vfmul_s(_tmp01, _v2))); + + __m128 _r0tm5 = __lsx_vfadd_s(_tmp56a, _tmp56b); + __m128 _r0tm6 = __lsx_vfsub_s(_tmp56a, _tmp56b); + + __lsx_vst(_r0tm0, r0_tm_0, 0); + __lsx_vst(_r0tm1, r0_tm_1, 0); + __lsx_vst(_r0tm2, r0_tm_2, 0); + __lsx_vst(_r0tm3, r0_tm_3, 0); + __lsx_vst(_r0tm4, r0_tm_4, 0); + __lsx_vst(_r0tm5, r0_tm_5, 0); + __lsx_vst(_r0tm6, r0_tm_6, 0); + __lsx_vst(_r0tm7, r0_tm_7, 0); + + r0_tm_0 += tiles * 4 * 8; + r0_tm_1 += tiles * 4 * 8; + r0_tm_2 += tiles * 4 * 8; + r0_tm_3 += tiles * 4 * 8; + r0_tm_4 += tiles * 4 * 8; + r0_tm_5 += tiles * 4 * 8; + r0_tm_6 += tiles * 4 * 8; + r0_tm_7 += tiles * 4 * 8; + } + } + } + } +} + +static void conv3x3s1_winograd63_transform_output_pack4_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 6; + const int h_tiles = outh / 6; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[6][8] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 16.0f,-16.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 8.0f, 8.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 4.0f, -4.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 16.0f, 16.0f, 2.0f, 2.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 32.0f, -32.0f, 1.0f, -1.0f, 1.0f} + // }; + + // 0 = r0 + 
(r1 + r2) + (r3 + r4) + (r5 + r6) * 32 + // 1 = (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16 + // 2 = (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8 + // 3 = (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4 + // 4 = (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2 + // 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6) + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = biasptr ? (__m128)__lsx_vld(biasptr + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + float tmp[6][8][4]; + + __m128 _v32 = __lsx_vreplfr2vr_s(32.f); + __m128 _v16 = __lsx_vreplfr2vr_s(16.f); + __m128 _v8 = __lsx_vreplfr2vr_s(8.f); + __m128 _v4 = __lsx_vreplfr2vr_s(4.f); + __m128 _v2 = __lsx_vreplfr2vr_s(2.f); + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j) * 4; + const float* output0_tm_1 = output0_tm_0 + tiles * 4; + const float* output0_tm_2 = output0_tm_0 + tiles * 4 * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 4 * 3; + const float* output0_tm_4 = output0_tm_0 + tiles * 4 * 4; + const float* output0_tm_5 = output0_tm_0 + tiles * 4 * 5; + const float* output0_tm_6 = output0_tm_0 + tiles * 4 * 6; + const float* output0_tm_7 = output0_tm_0 + tiles * 4 * 7; + + float* output0 = out0.row(i * 6) + (j * 6) * 4; + + for (int m = 0; m < 8; m++) + { + __m128 _out0tm0 = (__m128)__lsx_vld(output0_tm_0, 0); + __m128 _out0tm1 = (__m128)__lsx_vld(output0_tm_1, 0); + __m128 _out0tm2 = (__m128)__lsx_vld(output0_tm_2, 0); + __m128 _out0tm3 = (__m128)__lsx_vld(output0_tm_3, 0); + __m128 _out0tm4 = (__m128)__lsx_vld(output0_tm_4, 0); + __m128 _out0tm5 = (__m128)__lsx_vld(output0_tm_5, 0); + __m128 _out0tm6 = (__m128)__lsx_vld(output0_tm_6, 0); + __m128 _out0tm7 = (__m128)__lsx_vld(output0_tm_7, 0); + + __m128 _tmp024a = __lsx_vfadd_s(_out0tm1, _out0tm2); + __m128 _tmp135a = __lsx_vfsub_s(_out0tm1, _out0tm2); + + __m128 _tmp024b = __lsx_vfadd_s(_out0tm3, _out0tm4); + __m128 _tmp135b = __lsx_vfsub_s(_out0tm3, _out0tm4); + + __m128 _tmp024c = __lsx_vfadd_s(_out0tm5, _out0tm6); + __m128 _tmp135c = __lsx_vfsub_s(_out0tm5, _out0tm6); + + __m128 _tmp0m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm0, _tmp024a), __lsx_vfmadd_s(_tmp024c, _v32, _tmp024b)); + __m128 _tmp2m = __lsx_vfmadd_s(_tmp024c, _v8, __lsx_vfmadd_s(_tmp024b, _v4, _tmp024a)); + __m128 _tmp4m = __lsx_vfmadd_s(_tmp024c, _v2, __lsx_vfmadd_s(_tmp024b, _v16, _tmp024a)); + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp4m, tmp[4][m], 0); + + __m128 _tmp1m = __lsx_vfmadd_s(_tmp135c, _v16, __lsx_vfmadd_s(_tmp135b, _v2, _tmp135a)); + __m128 _tmp3m = __lsx_vfmadd_s(_tmp135c, _v4, __lsx_vfmadd_s(_tmp135b, _v8, _tmp135a)); + __m128 _tmp5m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm7, _tmp135a), __lsx_vfmadd_s(_tmp135b, _v32, _tmp135c)); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + __lsx_vst(_tmp5m, tmp[5][m], 0); + + output0_tm_0 += tiles * 4 * 8; + output0_tm_1 += tiles * 4 * 8; + output0_tm_2 += tiles * 4 * 8; + output0_tm_3 += tiles * 4 * 8; + output0_tm_4 += tiles * 4 * 8; + output0_tm_5 += tiles * 4 * 8; + output0_tm_6 += tiles * 4 * 8; + output0_tm_7 += tiles * 4 * 8; + } + + for (int m = 0; m < 6; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = 
(__m128)__lsx_vld(tmp[m][3], 0); + __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0); + __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0); + __m128 _tmp06 = (__m128)__lsx_vld(tmp[m][6], 0); + __m128 _tmp07 = (__m128)__lsx_vld(tmp[m][7], 0); + + __m128 _tmp024a = __lsx_vfadd_s(_tmp01, _tmp02); + __m128 _tmp135a = __lsx_vfsub_s(_tmp01, _tmp02); + + __m128 _tmp024b = __lsx_vfadd_s(_tmp03, _tmp04); + __m128 _tmp135b = __lsx_vfsub_s(_tmp03, _tmp04); + + __m128 _tmp024c = __lsx_vfadd_s(_tmp05, _tmp06); + __m128 _tmp135c = __lsx_vfsub_s(_tmp05, _tmp06); + + __m128 _out00 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp00, _tmp024a), __lsx_vfmadd_s(_tmp024c, _v32, _tmp024b))); + __m128 _out02 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp024c, _v8, __lsx_vfmadd_s(_tmp024b, _v4, _tmp024a))); + __m128 _out04 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp024c, _v2, __lsx_vfmadd_s(_tmp024b, _v16, _tmp024a))); + __lsx_vst(_out00, output0, 0); + __lsx_vst(_out02, output0 + 4 * 2, 0); + __lsx_vst(_out04, output0 + 4 * 4, 0); + + __m128 _out01 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp135c, _v16, __lsx_vfmadd_s(_tmp135b, _v2, _tmp135a))); + __m128 _out03 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp135c, _v4, __lsx_vfmadd_s(_tmp135b, _v8, _tmp135a))); + __m128 _out05 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp07, _tmp135a), __lsx_vfmadd_s(_tmp135b, _v32, _tmp135c))); + __lsx_vst(_out01, output0 + 4, 0); + __lsx_vst(_out03, output0 + 4 * 3, 0); + __lsx_vst(_out05, output0 + 4 * 5, 0); + + output0 += outw * 4; + } + } + } + } +} + +static void conv3x3s1_winograd43_transform_input_pack4_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 4; + const int h_tiles = (h - 2) / 4; + const int tiles = w_tiles * h_tiles; + + // const float itm[6][6] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[6][6][4]; + + __m128 _vm5 = __lsx_vreplfr2vr_s(-5.f); + __m128 _vm4 = __lsx_vreplfr2vr_s(-4.f); + __m128 _v4 = __lsx_vreplfr2vr_s(4.f); + __m128 _vm2 = __lsx_vreplfr2vr_s(-2.f); + __m128 _v2 = __lsx_vreplfr2vr_s(2.f); + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 4) + (j * 4) * 4; + + for (int m = 0; m < 6; m++) + { + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + __m128 _r05 = (__m128)__lsx_vld(r0 + 4 * 5, 0); + + __m128 _tmp0m = __lsx_vfmadd_s(_r02, _vm5, __lsx_vfmadd_s(_r00, _v4, _r04)); + __m128 _tmp1m = __lsx_vfmadd_s(__lsx_vfadd_s(_r01, _r02), _vm4, __lsx_vfadd_s(_r04, _r03)); + __m128 _tmp2m = __lsx_vfmadd_s(__lsx_vfsub_s(_r01, _r02), _v4, __lsx_vfsub_s(_r04, _r03)); + 
__m128 _tmp3m = __lsx_vfmadd_s(__lsx_vfsub_s(_r01, _r03), _vm2, __lsx_vfsub_s(_r04, _r02)); + __m128 _tmp4m = __lsx_vfmadd_s(__lsx_vfsub_s(_r01, _r03), _v2, __lsx_vfsub_s(_r04, _r02)); + __m128 _tmp5m = __lsx_vfmadd_s(_r03, _vm5, __lsx_vfmadd_s(_r01, _v4, _r05)); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + __lsx_vst(_tmp4m, tmp[4][m], 0); + __lsx_vst(_tmp5m, tmp[5][m], 0); + + r0 += w * 4; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j) * 4; + float* r0_tm_1 = r0_tm_0 + tiles * 4; + float* r0_tm_2 = r0_tm_0 + tiles * 4 * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 4 * 3; + float* r0_tm_4 = r0_tm_0 + tiles * 4 * 4; + float* r0_tm_5 = r0_tm_0 + tiles * 4 * 5; + + for (int m = 0; m < 6; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0); + __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0); + + __m128 _r0tm0 = __lsx_vfmadd_s(_tmp02, _vm5, __lsx_vfmadd_s(_tmp00, _v4, _tmp04)); + __m128 _r0tm1 = __lsx_vfmadd_s(__lsx_vfadd_s(_tmp01, _tmp02), _vm4, __lsx_vfadd_s(_tmp04, _tmp03)); + __m128 _r0tm2 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp01, _tmp02), _v4, __lsx_vfsub_s(_tmp04, _tmp03)); + __m128 _r0tm3 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp01, _tmp03), _vm2, __lsx_vfsub_s(_tmp04, _tmp02)); + __m128 _r0tm4 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp01, _tmp03), _v2, __lsx_vfsub_s(_tmp04, _tmp02)); + __m128 _r0tm5 = __lsx_vfmadd_s(_tmp03, _vm5, __lsx_vfmadd_s(_tmp01, _v4, _tmp05)); + + __lsx_vst(_r0tm0, r0_tm_0, 0); + __lsx_vst(_r0tm1, r0_tm_1, 0); + __lsx_vst(_r0tm2, r0_tm_2, 0); + __lsx_vst(_r0tm3, r0_tm_3, 0); + __lsx_vst(_r0tm4, r0_tm_4, 0); + __lsx_vst(_r0tm5, r0_tm_5, 0); + + r0_tm_0 += tiles * 4 * 6; + r0_tm_1 += tiles * 4 * 6; + r0_tm_2 += tiles * 4 * 6; + r0_tm_3 += tiles * 4 * 6; + r0_tm_4 += tiles * 4 * 6; + r0_tm_5 += tiles * 4 * 6; + } + } + } + } +} + +static void conv3x3s1_winograd43_transform_output_pack4_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 4; + const int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = biasptr ? 
(__m128)__lsx_vld(biasptr + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + float tmp[4][6][4]; + + __m128 _v2 = __lsx_vreplfr2vr_s(2.f); + __m128 _v4 = __lsx_vreplfr2vr_s(4.f); + __m128 _v8 = __lsx_vreplfr2vr_s(8.f); + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j) * 4; + const float* output0_tm_1 = output0_tm_0 + tiles * 4; + const float* output0_tm_2 = output0_tm_0 + tiles * 4 * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 4 * 3; + const float* output0_tm_4 = output0_tm_0 + tiles * 4 * 4; + const float* output0_tm_5 = output0_tm_0 + tiles * 4 * 5; + + float* output0 = out0.row(i * 4) + (j * 4) * 4; + + for (int m = 0; m < 6; m++) + { + __m128 _out0tm0 = (__m128)__lsx_vld(output0_tm_0, 0); + __m128 _out0tm1 = (__m128)__lsx_vld(output0_tm_1, 0); + __m128 _out0tm2 = (__m128)__lsx_vld(output0_tm_2, 0); + __m128 _out0tm3 = (__m128)__lsx_vld(output0_tm_3, 0); + __m128 _out0tm4 = (__m128)__lsx_vld(output0_tm_4, 0); + __m128 _out0tm5 = (__m128)__lsx_vld(output0_tm_5, 0); + + __m128 _tmp02a = __lsx_vfadd_s(_out0tm1, _out0tm2); + __m128 _tmp13a = __lsx_vfsub_s(_out0tm1, _out0tm2); + + __m128 _tmp02b = __lsx_vfadd_s(_out0tm3, _out0tm4); + __m128 _tmp13b = __lsx_vfsub_s(_out0tm3, _out0tm4); + + __m128 _tmp0m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm0, _tmp02a), _tmp02b); + __m128 _tmp1m = __lsx_vfmadd_s(_tmp13b, _v2, _tmp13a); + __m128 _tmp2m = __lsx_vfmadd_s(_tmp02b, _v4, _tmp02a); + __m128 _tmp3m = __lsx_vfmadd_s(_tmp13b, _v8, __lsx_vfadd_s(_out0tm5, _tmp13a)); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + + output0_tm_0 += tiles * 4 * 6; + output0_tm_1 += tiles * 4 * 6; + output0_tm_2 += tiles * 4 * 6; + output0_tm_3 += tiles * 4 * 6; + output0_tm_4 += tiles * 4 * 6; + output0_tm_5 += tiles * 4 * 6; + } + + for (int m = 0; m < 4; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0); + __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0); + + __m128 _tmp02a = __lsx_vfadd_s(_tmp01, _tmp02); + __m128 _tmp13a = __lsx_vfsub_s(_tmp01, _tmp02); + + __m128 _tmp02b = __lsx_vfadd_s(_tmp03, _tmp04); + __m128 _tmp13b = __lsx_vfsub_s(_tmp03, _tmp04); + + __m128 _out00 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp00, _tmp02a), _tmp02b)); + __m128 _out01 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp13b, _v2, _tmp13a)); + __m128 _out02 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp02b, _v4, _tmp02a)); + __m128 _out03 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp13b, _v8, __lsx_vfadd_s(_tmp05, _tmp13a))); + + __lsx_vst(_out00, output0, 0); + __lsx_vst(_out01, output0 + 4, 0); + __lsx_vst(_out02, output0 + 4 * 2, 0); + __lsx_vst(_out03, output0 + 4 * 3, 0); + + output0 += outw * 4; + } + } + } + } +} + +static void conv3x3s1_winograd23_transform_input_pack4_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 2; + const int h_tiles = (h - 2) / 2; + const int tiles = w_tiles * h_tiles; + + // const float itm[4][4] = { + // {1.0f, 0.0f, -1.0f, 0.0f}, + // {0.0f, 1.0f, 1.00f, 0.0f}, + // {0.0f, -1.0f, 1.00f, 0.0f}, + // {0.0f, -1.0f, 0.00f, 1.0f} + // 
}; + + // 0 = r00 - r02 + // 1 = r01 + r02 + // 2 = r02 - r01 + // 3 = r03 - r01 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[4][4][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 2) + (j * 2) * 4; + + for (int m = 0; m < 4; m++) + { + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + + __m128 _tmp0m = __lsx_vfsub_s(_r00, _r02); + __m128 _tmp1m = __lsx_vfadd_s(_r01, _r02); + __m128 _tmp2m = __lsx_vfsub_s(_r02, _r01); + __m128 _tmp3m = __lsx_vfsub_s(_r03, _r01); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + + r0 += w * 4; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j) * 4; + float* r0_tm_1 = r0_tm_0 + tiles * 4; + float* r0_tm_2 = r0_tm_0 + tiles * 4 * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 4 * 3; + + for (int m = 0; m < 4; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + + __m128 _r0tm0 = __lsx_vfsub_s(_tmp00, _tmp02); + __m128 _r0tm1 = __lsx_vfadd_s(_tmp01, _tmp02); + __m128 _r0tm2 = __lsx_vfsub_s(_tmp02, _tmp01); + __m128 _r0tm3 = __lsx_vfsub_s(_tmp03, _tmp01); + + __lsx_vst(_r0tm0, r0_tm_0, 0); + __lsx_vst(_r0tm1, r0_tm_1, 0); + __lsx_vst(_r0tm2, r0_tm_2, 0); + __lsx_vst(_r0tm3, r0_tm_3, 0); + + r0_tm_0 += tiles * 4 * 4; + r0_tm_1 += tiles * 4 * 4; + r0_tm_2 += tiles * 4 * 4; + r0_tm_3 += tiles * 4 * 4; + } + } + } + } +} + +static void conv3x3s1_winograd23_transform_output_pack4_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 2; + const int h_tiles = outh / 2; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[2][4] = { + // {1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 1.0f} + // }; + + // 0 = r00 + r01 + r02 + // 1 = r01 - r02 + r03 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = biasptr ? 
(__m128)__lsx_vld(biasptr + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + float tmp[2][4][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j) * 4; + const float* output0_tm_1 = output0_tm_0 + tiles * 4; + const float* output0_tm_2 = output0_tm_0 + tiles * 4 * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 4 * 3; + + float* output0 = out0.row(i * 2) + (j * 2) * 4; + + for (int m = 0; m < 4; m++) + { + __m128 _out0tm0 = (__m128)__lsx_vld(output0_tm_0, 0); + __m128 _out0tm1 = (__m128)__lsx_vld(output0_tm_1, 0); + __m128 _out0tm2 = (__m128)__lsx_vld(output0_tm_2, 0); + __m128 _out0tm3 = (__m128)__lsx_vld(output0_tm_3, 0); + + __m128 _tmp0m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm0, _out0tm1), _out0tm2); + __m128 _tmp1m = __lsx_vfadd_s(__lsx_vfsub_s(_out0tm1, _out0tm2), _out0tm3); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + + output0_tm_0 += tiles * 4 * 4; + output0_tm_1 += tiles * 4 * 4; + output0_tm_2 += tiles * 4 * 4; + output0_tm_3 += tiles * 4 * 4; + } + + for (int m = 0; m < 2; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + + __m128 _out00 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp00, _tmp01), _tmp02)); + __m128 _out01 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfsub_s(_tmp01, _tmp02), _tmp03)); + + __lsx_vst(_out00, output0, 0); + __lsx_vst(_out01, output0 + 4, 0); + + output0 += outw * 4; + } + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform_pack4_int8.h b/src/layer/loongarch/convolution_winograd_transform_pack4_int8.h new file mode 100644 index 000000000000..8b31ce97a869 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform_pack4_int8.h @@ -0,0 +1,166 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
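
Every row of the pack4 transforms above is evaluated as a chain of fused multiply-adds: as used in this file, __lsx_vfmadd_s(a, b, c) corresponds to a * b + c in each float lane. The scalar sketch below mirrors the _tmp34a / _tmp34b split from the F(6,3) input transform with std::fma and checks the chained form against the plain polynomial of the commented itm row; the data values are arbitrary.

#include <cassert>
#include <cmath>

int main()
{
    const float r[8] = {1.f, -2.f, 3.f, 0.5f, -1.5f, 4.f, 2.f, -3.f};

    // row 3 of the commented itm[8][8]:
    // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2)
    const float direct = r[6] + r[2] * 0.25f - r[4] * 1.25f
                         + r[1] * 0.5f - r[3] * 2.5f + r[5] * 2.f;

    // the same value the way the vector code builds it:
    // two halves, each a chain of fma, then one add
    const float tmp34a = std::fma(r[4], -1.25f, std::fma(r[2], 0.25f, r[6]));
    const float tmp34b = std::fma(r[5], 2.f, std::fma(r[3], -2.5f, r[1] * 0.5f));
    const float fused = tmp34a + tmp34b;

    assert(std::fabs(direct - fused) < 1e-5f);
    return 0;
}

The constants are hoisted into _v0_25, _vm1_25, _vm2_5 and so on before the tile loops for the same reason a scalar compiler would hoist them: they are loop-invariant broadcasts.
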
+ +static void conv3x3s1_winograd43_transform_output_pack4_int8_lsx(const Mat& top_blob_tm, Mat& top_blob, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 4; + const int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + int tmp[4][6][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const int* output0_tm_0 = (const int*)out0_tm + (i * w_tiles + j) * 4; + const int* output0_tm_1 = output0_tm_0 + tiles * 4; + const int* output0_tm_2 = output0_tm_0 + tiles * 8; + const int* output0_tm_3 = output0_tm_0 + tiles * 12; + const int* output0_tm_4 = output0_tm_0 + tiles * 16; + const int* output0_tm_5 = output0_tm_0 + tiles * 20; + + int* output0 = out0.row(i * 4) + (j * 4) * 4; + + for (int m = 0; m < 5; m++) + { + __m128i _out0tm0 = __lsx_vld(output0_tm_0, 0); + __m128i _out0tm1 = __lsx_vld(output0_tm_1, 0); + __m128i _out0tm2 = __lsx_vld(output0_tm_2, 0); + __m128i _out0tm3 = __lsx_vld(output0_tm_3, 0); + __m128i _out0tm4 = __lsx_vld(output0_tm_4, 0); + __m128i _out0tm5 = __lsx_vld(output0_tm_5, 0); + + __m128i _tmp02a = __lsx_vadd_w(_out0tm1, _out0tm2); + __m128i _tmp13a = __lsx_vsub_w(_out0tm1, _out0tm2); + + __m128i _tmp02b = __lsx_vadd_w(_out0tm3, _out0tm4); + __m128i _tmp13b = __lsx_vsub_w(_out0tm3, _out0tm4); + + __m128i _tmp0m = __lsx_vadd_w(__lsx_vadd_w(_out0tm0, _tmp02a), _tmp02b); + __m128i _tmp1m = __lsx_vadd_w(_tmp13a, __lsx_vslli_w(_tmp13b, 1)); + __m128i _tmp2m = __lsx_vadd_w(_tmp02a, __lsx_vslli_w(_tmp02b, 2)); + __m128i _tmp3m = __lsx_vadd_w(__lsx_vadd_w(_tmp13a, __lsx_vslli_w(_out0tm5, 2)), __lsx_vslli_w(_tmp13b, 3)); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + + output0_tm_0 += tiles * 24; + output0_tm_1 += tiles * 24; + output0_tm_2 += tiles * 24; + output0_tm_3 += tiles * 24; + output0_tm_4 += tiles * 24; + output0_tm_5 += tiles * 24; + } + for (int m = 5; m < 6; m++) + { + __m128i _out0tm0 = __lsx_vld(output0_tm_0, 0); + __m128i _out0tm1 = __lsx_vld(output0_tm_1, 0); + __m128i _out0tm2 = __lsx_vld(output0_tm_2, 0); + __m128i _out0tm3 = __lsx_vld(output0_tm_3, 0); + __m128i _out0tm4 = __lsx_vld(output0_tm_4, 0); + __m128i _out0tm5 = __lsx_vld(output0_tm_5, 0); + + __m128i _tmp02a = __lsx_vadd_w(_out0tm1, _out0tm2); + __m128i _tmp13a = __lsx_vsub_w(_out0tm1, _out0tm2); + + __m128i _tmp02b = __lsx_vadd_w(_out0tm3, _out0tm4); + __m128i _tmp13b = __lsx_vsub_w(_out0tm3, _out0tm4); + + __m128i _tmp0m = __lsx_vadd_w(__lsx_vadd_w(_out0tm0, _tmp02a), _tmp02b); + __m128i _tmp1m = __lsx_vadd_w(_tmp13a, __lsx_vslli_w(_tmp13b, 1)); + __m128i _tmp2m = __lsx_vadd_w(_tmp02a, __lsx_vslli_w(_tmp02b, 2)); + __m128i _tmp3m = __lsx_vadd_w(__lsx_vadd_w(_tmp13a, __lsx_vslli_w(_out0tm5, 2)), __lsx_vslli_w(_tmp13b, 3)); + + _tmp0m = __lsx_vslli_w(_tmp0m, 2); + _tmp1m = __lsx_vslli_w(_tmp1m, 2); + 
_tmp2m = __lsx_vslli_w(_tmp2m, 2); + _tmp3m = __lsx_vslli_w(_tmp3m, 2); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + + output0_tm_0 += tiles * 24; + output0_tm_1 += tiles * 24; + output0_tm_2 += tiles * 24; + output0_tm_3 += tiles * 24; + output0_tm_4 += tiles * 24; + output0_tm_5 += tiles * 24; + } + + for (int m = 0; m < 4; m++) + { + __m128i _tmp00 = __lsx_vld(tmp[m][0], 0); + __m128i _tmp01 = __lsx_vld(tmp[m][1], 0); + __m128i _tmp02 = __lsx_vld(tmp[m][2], 0); + __m128i _tmp03 = __lsx_vld(tmp[m][3], 0); + __m128i _tmp04 = __lsx_vld(tmp[m][4], 0); + __m128i _tmp05 = __lsx_vld(tmp[m][5], 0); + + __m128i _tmp02a = __lsx_vadd_w(_tmp01, _tmp02); + __m128i _tmp13a = __lsx_vsub_w(_tmp01, _tmp02); + + __m128i _tmp02b = __lsx_vadd_w(_tmp03, _tmp04); + __m128i _tmp13b = __lsx_vsub_w(_tmp03, _tmp04); + + __m128i _out00 = __lsx_vadd_w(__lsx_vadd_w(_tmp00, _tmp02a), _tmp02b); + __m128i _out01 = __lsx_vadd_w(_tmp13a, __lsx_vslli_w(_tmp13b, 1)); + __m128i _out02 = __lsx_vadd_w(_tmp02a, __lsx_vslli_w(_tmp02b, 2)); + __m128i _out03 = __lsx_vadd_w(__lsx_vadd_w(_tmp05, _tmp13a), __lsx_vslli_w(_tmp13b, 3)); + + // TODO use integer trick for division by 576 + __m128 _v576 = __lsx_vreplfr2vr_s(1.0 / 576); + _out00 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out00), _v576)); + _out01 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out01), _v576)); + _out02 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out02), _v576)); + _out03 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out03), _v576)); + + __lsx_vst(_out00, output0, 0); + __lsx_vst(_out01, output0 + 4, 0); + __lsx_vst(_out02, output0 + 8, 0); + __lsx_vst(_out03, output0 + 12, 0); + + output0 += outw * 4; + } + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform_pack8_int8.h b/src/layer/loongarch/convolution_winograd_transform_pack8_int8.h new file mode 100644 index 000000000000..5e49a87669a6 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform_pack8_int8.h @@ -0,0 +1,132 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
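
The pack4 int8 output transform above is the integer twin of the float version: the * 2, * 4 and * 8 factors become left shifts (__lsx_vslli_w by 1, 2 and 3), the sixth transform channel is pre-scaled by 4 (and the whole m == 5 column by another 4), and the final normalisation divides by 576 = 24 * 24 through a float multiply with 1/576 before converting back to int32, as the TODO comment notes. The scalar sketch below only illustrates the shift-for-multiply step on the same tmp02/tmp13 grouping; it sticks to non-negative intermediates, whereas the vector instruction shifts each signed 32-bit lane directly.

#include <cassert>

int main()
{
    const int r[6] = {10, 7, 3, 5, 2, 8};

    const int tmp02a = r[1] + r[2], tmp13a = r[1] - r[2];
    const int tmp02b = r[3] + r[4], tmp13b = r[3] - r[4];

    // multiply form, as in the float otm comments
    const int out_mul[4] = {
        r[0] + tmp02a + tmp02b,
        tmp13a + tmp13b * 2,
        tmp02a + tmp02b * 4,
        r[5] + tmp13a + tmp13b * 8,
    };

    // shift form, as in the int32 code above
    const int out_shift[4] = {
        r[0] + tmp02a + tmp02b,
        tmp13a + (tmp13b << 1),
        tmp02a + (tmp02b << 2),
        r[5] + tmp13a + (tmp13b << 3),
    };

    for (int i = 0; i < 4; i++)
        assert(out_mul[i] == out_shift[i]);
    return 0;
}
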
+ +static void conv3x3s1_winograd43_transform_input_pack8_int8_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 4; + const int h_tiles = (h - 2) / 4; + const int tiles = w_tiles * h_tiles; + + // const float itm[6][6] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + short tmp[6][6][8]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const signed char* r0 = img0.row(i * 4) + (j * 4) * 8; + + for (int m = 0; m < 6; m++) + { + __m128i _r00_01 = __lsx_vld(r0, 0); + __m128i _r02_03 = __lsx_vld(r0 + 16, 0); + __m128i _r04_05 = __lsx_vld(r0 + 32, 0); + __m128i _extr0001 = __lsx_vslti_b(_r00_01, 0); + __m128i _extr0203 = __lsx_vslti_b(_r02_03, 0); + __m128i _extr0405 = __lsx_vslti_b(_r04_05, 0); + __m128i _r00 = __lsx_vilvl_b(_extr0001, _r00_01); + __m128i _r01 = __lsx_vilvh_b(_extr0001, _r00_01); + __m128i _r02 = __lsx_vilvl_b(_extr0203, _r02_03); + __m128i _r03 = __lsx_vilvh_b(_extr0203, _r02_03); + __m128i _r04 = __lsx_vilvl_b(_extr0405, _r04_05); + __m128i _r05 = __lsx_vilvh_b(_extr0405, _r04_05); + + __m128i _v5 = __lsx_vreplgr2vr_h(5); + + __m128i _tmp0m = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_r00, 2), _r04), __lsx_vmul_h(_r02, _v5)); + __m128i _tmp1m = __lsx_vsub_h(__lsx_vadd_h(_r04, _r03), __lsx_vslli_h(__lsx_vadd_h(_r01, _r02), 2)); + __m128i _tmp2m = __lsx_vadd_h(__lsx_vsub_h(_r04, _r03), __lsx_vslli_h(__lsx_vsub_h(_r01, _r02), 2)); + __m128i _tmp3m = __lsx_vsub_h(__lsx_vsub_h(_r04, _r02), __lsx_vslli_h(__lsx_vsub_h(_r01, _r03), 1)); + __m128i _tmp4m = __lsx_vadd_h(__lsx_vsub_h(_r04, _r02), __lsx_vslli_h(__lsx_vsub_h(_r01, _r03), 1)); + __m128i _tmp5m = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_r01, 2), _r05), __lsx_vmul_h(_r03, _v5)); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + __lsx_vst(_tmp4m, tmp[4][m], 0); + __lsx_vst(_tmp5m, tmp[5][m], 0); + + r0 += w * 8; + } + + short* r0_tm_0 = (short*)img0_tm + (i * w_tiles + j) * 8; + short* r0_tm_1 = r0_tm_0 + tiles * 8; + short* r0_tm_2 = r0_tm_0 + tiles * 16; + short* r0_tm_3 = r0_tm_0 + tiles * 24; + short* r0_tm_4 = r0_tm_0 + tiles * 32; + short* r0_tm_5 = r0_tm_0 + tiles * 40; + + for (int m = 0; m < 6; m++) + { + __m128i _tmp00 = __lsx_vld(tmp[m][0], 0); + __m128i _tmp01 = __lsx_vld(tmp[m][1], 0); + __m128i _tmp02 = __lsx_vld(tmp[m][2], 0); + __m128i _tmp03 = __lsx_vld(tmp[m][3], 0); + __m128i _tmp04 = __lsx_vld(tmp[m][4], 0); + __m128i _tmp05 = __lsx_vld(tmp[m][5], 0); + + __m128i _v5 = __lsx_vreplgr2vr_h(5); + + __m128i _r0tm0 = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_tmp00, 2), _tmp04), __lsx_vmul_h(_tmp02, _v5)); + __m128i _r0tm1 = __lsx_vsub_h(__lsx_vadd_h(_tmp04, _tmp03), __lsx_vslli_h(__lsx_vadd_h(_tmp01, _tmp02), 2)); 
+ __m128i _r0tm2 = __lsx_vadd_h(__lsx_vsub_h(_tmp04, _tmp03), __lsx_vslli_h(__lsx_vsub_h(_tmp01, _tmp02), 2)); + __m128i _r0tm3 = __lsx_vsub_h(__lsx_vsub_h(_tmp04, _tmp02), __lsx_vslli_h(__lsx_vsub_h(_tmp01, _tmp03), 1)); + __m128i _r0tm4 = __lsx_vadd_h(__lsx_vsub_h(_tmp04, _tmp02), __lsx_vslli_h(__lsx_vsub_h(_tmp01, _tmp03), 1)); + __m128i _r0tm5 = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_tmp01, 2), _tmp05), __lsx_vmul_h(_tmp03, _v5)); + + __lsx_vst(_r0tm0, r0_tm_0, 0); + __lsx_vst(_r0tm1, r0_tm_1, 0); + __lsx_vst(_r0tm2, r0_tm_2, 0); + __lsx_vst(_r0tm3, r0_tm_3, 0); + __lsx_vst(_r0tm4, r0_tm_4, 0); + __lsx_vst(_r0tm5, r0_tm_5, 0); + + r0_tm_0 += tiles * 48; + r0_tm_1 += tiles * 48; + r0_tm_2 += tiles * 48; + r0_tm_3 += tiles * 48; + r0_tm_4 += tiles * 48; + r0_tm_5 += tiles * 48; + } + } + } + } +} diff --git a/src/layer/loongarch/convolutiondepthwise_3x3.h b/src/layer/loongarch/convolutiondepthwise_3x3.h new file mode 100644 index 000000000000..1c37f7789f3b --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_3x3.h @@ -0,0 +1,193 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convdw3x3s1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const float* kernel = _kernel; + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + const float bias0 = bias ? 
bias[g] : 0.f; + + const float* kernel0 = kernel + g * 9; + + float* outptr0 = out; + float* outptr1 = outptr0 + outw; + + const float* img0 = bottom_blob.channel(g); + + const float* r0 = img0; + const float* r1 = img0 + w; + const float* r2 = img0 + w * 2; + const float* r3 = img0 + w * 3; + + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + int i = 0; + + for (; i + 1 < outh; i += 2) + { + for (int j = 0; j < outw; j++) + { + float sum = bias0; + float sum2 = bias0; + + sum += r0[0] * k0[0]; + sum += r0[1] * k0[1]; + sum += r0[2] * k0[2]; + sum2 += r1[0] * k0[0]; + sum2 += r1[1] * k0[1]; + sum2 += r1[2] * k0[2]; + sum += r1[0] * k1[0]; + sum += r1[1] * k1[1]; + sum += r1[2] * k1[2]; + sum2 += r2[0] * k1[0]; + sum2 += r2[1] * k1[1]; + sum2 += r2[2] * k1[2]; + sum += r2[0] * k2[0]; + sum += r2[1] * k2[1]; + sum += r2[2] * k2[2]; + sum2 += r3[0] * k2[0]; + sum2 += r3[1] * k2[1]; + sum2 += r3[2] * k2[2]; + + *outptr0 = sum; + *outptr1 = sum2; + + r0++; + r1++; + r2++; + r3++; + outptr0++; + outptr1++; + } + + r0 += 2 + w; + r1 += 2 + w; + r2 += 2 + w; + r3 += 2 + w; + + outptr0 += outw; + outptr1 += outw; + } + + for (; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = bias0; + sum += r0[0] * k0[0]; + sum += r0[1] * k0[1]; + sum += r0[2] * k0[2]; + sum += r1[0] * k1[0]; + sum += r1[1] * k1[1]; + sum += r1[2] * k1[2]; + sum += r2[0] * k2[0]; + sum += r2[1] * k2[1]; + sum += r2[2] * k2[2]; + + *outptr0 = sum; + + r0++; + r1++; + r2++; + outptr0++; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + } +} + +static void convdw3x3s2_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const int tailstep = w - 2 * outw + w; + + const float* kernel = _kernel; + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + const float bias0 = bias ? bias[g] : 0.f; + + const float* kernel0 = kernel + g * 9; + + float* outptr = out; + + const float* img0 = bottom_blob.channel(g); + + const float* r0 = img0; + const float* r1 = img0 + w; + const float* r2 = img0 + w * 2; + + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + int i = 0; + + for (; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = bias0; + sum += r0[0] * k0[0]; + sum += r0[1] * k0[1]; + sum += r0[2] * k0[2]; + sum += r1[0] * k1[0]; + sum += r1[1] * k1[1]; + sum += r1[2] * k1[2]; + sum += r2[0] * k2[0]; + sum += r2[1] * k2[1]; + sum += r2[2] * k2[2]; + + *outptr = sum; + + r0 += 2; + r1 += 2; + r2 += 2; + outptr++; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } +} diff --git a/src/layer/loongarch/convolutiondepthwise_3x3_pack4.h b/src/layer/loongarch/convolutiondepthwise_3x3_pack4.h new file mode 100644 index 000000000000..48ae66412fc1 --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_3x3_pack4.h @@ -0,0 +1,464 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convdw3x3s1_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + const float* k0 = kernel.row(g); + + float* outptr0 = out.row(0); + float* outptr1 = out.row(1); + + const Mat img0 = bottom_blob.channel(g); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + const float* r3 = img0.row(3); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0); + __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0); + + int i = 0; + for (; i + 1 < outh; i += 2) + { + int j = 0; + for (; j + 1 < outw; j += 2) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(r1 + 32); + __builtin_prefetch(r2 + 32); + __builtin_prefetch(r3 + 32); + + __m128 _sum00 = _bias0; + __m128 _sum01 = _bias0; + __m128 _sum10 = _bias0; + __m128 _sum11 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r00, _k00, _sum00); + _sum00 = __lsx_vfmadd_s(_r01, _k01, _sum00); + _sum00 = __lsx_vfmadd_s(_r02, _k02, _sum00); + _sum01 = __lsx_vfmadd_s(_r01, _k00, _sum01); + _sum01 = __lsx_vfmadd_s(_r02, _k01, _sum01); + _sum01 = __lsx_vfmadd_s(_r03, _k02, _sum01); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r10, _k10, _sum00); + _sum00 = __lsx_vfmadd_s(_r11, _k11, _sum00); + _sum00 = __lsx_vfmadd_s(_r12, _k12, _sum00); + _sum01 = __lsx_vfmadd_s(_r11, _k10, _sum01); + _sum01 = __lsx_vfmadd_s(_r12, _k11, _sum01); + _sum01 = __lsx_vfmadd_s(_r13, _k12, _sum01); + _sum10 = __lsx_vfmadd_s(_r10, _k00, _sum10); + _sum10 = __lsx_vfmadd_s(_r11, _k01, _sum10); + _sum10 = __lsx_vfmadd_s(_r12, _k02, _sum10); + _sum11 = __lsx_vfmadd_s(_r11, _k00, _sum11); + _sum11 = __lsx_vfmadd_s(_r12, _k01, _sum11); + _sum11 = __lsx_vfmadd_s(_r13, _k02, _sum11); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r20, _k20, _sum00); + _sum00 = __lsx_vfmadd_s(_r21, _k21, _sum00); + _sum00 = __lsx_vfmadd_s(_r22, _k22, _sum00); + _sum01 = 
__lsx_vfmadd_s(_r21, _k20, _sum01); + _sum01 = __lsx_vfmadd_s(_r22, _k21, _sum01); + _sum01 = __lsx_vfmadd_s(_r23, _k22, _sum01); + _sum10 = __lsx_vfmadd_s(_r20, _k10, _sum10); + _sum10 = __lsx_vfmadd_s(_r21, _k11, _sum10); + _sum10 = __lsx_vfmadd_s(_r22, _k12, _sum10); + _sum11 = __lsx_vfmadd_s(_r21, _k10, _sum11); + _sum11 = __lsx_vfmadd_s(_r22, _k11, _sum11); + _sum11 = __lsx_vfmadd_s(_r23, _k12, _sum11); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0); + + _sum10 = __lsx_vfmadd_s(_r30, _k20, _sum10); + _sum10 = __lsx_vfmadd_s(_r31, _k21, _sum10); + _sum10 = __lsx_vfmadd_s(_r32, _k22, _sum10); + _sum11 = __lsx_vfmadd_s(_r31, _k20, _sum11); + _sum11 = __lsx_vfmadd_s(_r32, _k21, _sum11); + _sum11 = __lsx_vfmadd_s(_r33, _k22, _sum11); + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum01, outptr0 + 4, 0); + __lsx_vst(_sum10, outptr1, 0); + __lsx_vst(_sum11, outptr1 + 4, 0); + + outptr0 += 4 * 2; + outptr1 += 4 * 2; + + r0 += 4 * 2; + r1 += 4 * 2; + r2 += 4 * 2; + r3 += 4 * 2; + } + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(r1 + 16); + __builtin_prefetch(r2 + 16); + __builtin_prefetch(r3 + 16); + + __m128 _sum0 = _bias0; + __m128 _sum1 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + _sum1 = __lsx_vfmadd_s(_r10, _k00, _sum1); + _sum1 = __lsx_vfmadd_s(_r11, _k01, _sum1); + _sum1 = __lsx_vfmadd_s(_r12, _k02, _sum1); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + _sum1 = __lsx_vfmadd_s(_r20, _k10, _sum1); + _sum1 = __lsx_vfmadd_s(_r21, _k11, _sum1); + _sum1 = __lsx_vfmadd_s(_r22, _k12, _sum1); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + + _sum1 = __lsx_vfmadd_s(_r30, _k20, _sum1); + _sum1 = __lsx_vfmadd_s(_r31, _k21, _sum1); + _sum1 = __lsx_vfmadd_s(_r32, _k22, _sum1); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + + outptr0 += 4; + outptr1 += 4; + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + } + + r0 += 2 * 4 + w * 4; + r1 += 2 * 4 + w * 4; + r2 += 2 * 4 + w * 4; + r3 += 2 * 4 + w * 4; + + outptr0 += outw * 4; + outptr1 += outw * 4; + } + for (; i < outh; i++) + { + int j = 0; + for (; j + 1 < outw; j += 2) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(r1 + 32); + __builtin_prefetch(r2 + 32); + + __m128 _sum00 = _bias0; + __m128 _sum01 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r00, _k00, _sum00); + _sum00 = __lsx_vfmadd_s(_r01, _k01, _sum00); + _sum00 = __lsx_vfmadd_s(_r02, 
_k02, _sum00); + _sum01 = __lsx_vfmadd_s(_r01, _k00, _sum01); + _sum01 = __lsx_vfmadd_s(_r02, _k01, _sum01); + _sum01 = __lsx_vfmadd_s(_r03, _k02, _sum01); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r10, _k10, _sum00); + _sum00 = __lsx_vfmadd_s(_r11, _k11, _sum00); + _sum00 = __lsx_vfmadd_s(_r12, _k12, _sum00); + _sum01 = __lsx_vfmadd_s(_r11, _k10, _sum01); + _sum01 = __lsx_vfmadd_s(_r12, _k11, _sum01); + _sum01 = __lsx_vfmadd_s(_r13, _k12, _sum01); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r20, _k20, _sum00); + _sum00 = __lsx_vfmadd_s(_r21, _k21, _sum00); + _sum00 = __lsx_vfmadd_s(_r22, _k22, _sum00); + _sum01 = __lsx_vfmadd_s(_r21, _k20, _sum01); + _sum01 = __lsx_vfmadd_s(_r22, _k21, _sum01); + _sum01 = __lsx_vfmadd_s(_r23, _k22, _sum01); + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum01, outptr0 + 4, 0); + + outptr0 += 4 * 2; + + r0 += 4 * 2; + r1 += 4 * 2; + r2 += 4 * 2; + } + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(r1 + 16); + __builtin_prefetch(r2 + 16); + + __m128 _sum0 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 4; + r1 += 4; + r2 += 4; + } + + r0 += 2 * 4; + r1 += 2 * 4; + r2 += 2 * 4; + } + } +} + +static void convdw3x3s2_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const int tailstep = (w - 2 * outw + w) * 4; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + __m128 _bias0 = bias ? 
(__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + const float* k0 = kernel.row(g); + + float* outptr0 = out; + + const Mat img0 = bottom_blob.channel(g); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0); + __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j + 1 < outw; j += 2) + { + __builtin_prefetch(r0 + 64); + __builtin_prefetch(r1 + 64); + __builtin_prefetch(r2 + 64); + + __m128 _sum00 = _bias0; + __m128 _sum01 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + + _sum00 = __lsx_vfmadd_s(_r00, _k00, _sum00); + _sum00 = __lsx_vfmadd_s(_r01, _k01, _sum00); + _sum00 = __lsx_vfmadd_s(_r02, _k02, _sum00); + _sum01 = __lsx_vfmadd_s(_r02, _k00, _sum01); + _sum01 = __lsx_vfmadd_s(_r03, _k01, _sum01); + _sum01 = __lsx_vfmadd_s(_r04, _k02, _sum01); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0); + + _sum00 = __lsx_vfmadd_s(_r10, _k10, _sum00); + _sum00 = __lsx_vfmadd_s(_r11, _k11, _sum00); + _sum00 = __lsx_vfmadd_s(_r12, _k12, _sum00); + _sum01 = __lsx_vfmadd_s(_r12, _k10, _sum01); + _sum01 = __lsx_vfmadd_s(_r13, _k11, _sum01); + _sum01 = __lsx_vfmadd_s(_r14, _k12, _sum01); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0); + + _sum00 = __lsx_vfmadd_s(_r20, _k20, _sum00); + _sum00 = __lsx_vfmadd_s(_r21, _k21, _sum00); + _sum00 = __lsx_vfmadd_s(_r22, _k22, _sum00); + _sum01 = __lsx_vfmadd_s(_r22, _k20, _sum01); + _sum01 = __lsx_vfmadd_s(_r23, _k21, _sum01); + _sum01 = __lsx_vfmadd_s(_r24, _k22, _sum01); + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum01, outptr0 + 4, 0); + + outptr0 += 4 * 2; + + r0 += 4 * 4; + r1 += 4 * 4; + r2 += 4 * 4; + } + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(r1 + 32); + __builtin_prefetch(r2 + 32); + + __m128 _sum0 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r20, _k20, 
_sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 4 * 2; + r1 += 4 * 2; + r2 += 4 * 2; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } +} diff --git a/src/layer/loongarch/convolutiondepthwise_5x5_pack4.h b/src/layer/loongarch/convolutiondepthwise_5x5_pack4.h new file mode 100644 index 000000000000..4f94c5e69958 --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_5x5_pack4.h @@ -0,0 +1,511 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convdw5x5s1_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + const float* k0 = kernel.row(g); + + float* outptr0 = out.row(0); + float* outptr1 = out.row(1); + + const Mat img0 = bottom_blob.channel(g); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + const float* r3 = img0.row(3); + const float* r4 = img0.row(4); + const float* r5 = img0.row(5); + + int i = 0; + for (; i + 1 < outh; i += 2) + { + int j = 0; + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(r1 + 16); + __builtin_prefetch(r2 + 16); + __builtin_prefetch(r3 + 16); + __builtin_prefetch(r4 + 16); + __builtin_prefetch(r5 + 16); + + __builtin_prefetch(k0 + 400); + + __m128 _sum0 = _bias0; + __m128 _sum1 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + _sum0 = __lsx_vfmadd_s(_r03, _k03, _sum0); + _sum0 = __lsx_vfmadd_s(_r04, _k04, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r10, _k00, _sum1); + _sum1 = __lsx_vfmadd_s(_r11, _k01, _sum1); + _sum1 = __lsx_vfmadd_s(_r12, _k02, _sum1); + _sum1 = 
__lsx_vfmadd_s(_r13, _k03, _sum1); + _sum1 = __lsx_vfmadd_s(_r14, _k04, _sum1); + + __m128 _k10 = (__m128)__lsx_vld(k0, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k14 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + _sum0 = __lsx_vfmadd_s(_r13, _k13, _sum0); + _sum0 = __lsx_vfmadd_s(_r14, _k14, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r20, _k10, _sum1); + _sum1 = __lsx_vfmadd_s(_r21, _k11, _sum1); + _sum1 = __lsx_vfmadd_s(_r22, _k12, _sum1); + _sum1 = __lsx_vfmadd_s(_r23, _k13, _sum1); + _sum1 = __lsx_vfmadd_s(_r24, _k14, _sum1); + + __m128 _k20 = (__m128)__lsx_vld(k0, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + _sum0 = __lsx_vfmadd_s(_r23, _k23, _sum0); + _sum0 = __lsx_vfmadd_s(_r24, _k24, _sum0); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0); + __m128 _r34 = (__m128)__lsx_vld(r3 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r30, _k20, _sum1); + _sum1 = __lsx_vfmadd_s(_r31, _k21, _sum1); + _sum1 = __lsx_vfmadd_s(_r32, _k22, _sum1); + _sum1 = __lsx_vfmadd_s(_r33, _k23, _sum1); + _sum1 = __lsx_vfmadd_s(_r34, _k24, _sum1); + + __m128 _k30 = (__m128)__lsx_vld(k0, 0); + __m128 _k31 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k33 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r30, _k30, _sum0); + _sum0 = __lsx_vfmadd_s(_r31, _k31, _sum0); + _sum0 = __lsx_vfmadd_s(_r32, _k32, _sum0); + _sum0 = __lsx_vfmadd_s(_r33, _k33, _sum0); + _sum0 = __lsx_vfmadd_s(_r34, _k34, _sum0); + + __m128 _r40 = (__m128)__lsx_vld(r4, 0); + __m128 _r41 = (__m128)__lsx_vld(r4 + 4, 0); + __m128 _r42 = (__m128)__lsx_vld(r4 + 4 * 2, 0); + __m128 _r43 = (__m128)__lsx_vld(r4 + 4 * 3, 0); + __m128 _r44 = (__m128)__lsx_vld(r4 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r40, _k30, _sum1); + _sum1 = __lsx_vfmadd_s(_r41, _k31, _sum1); + _sum1 = __lsx_vfmadd_s(_r42, _k32, _sum1); + _sum1 = __lsx_vfmadd_s(_r43, _k33, _sum1); + _sum1 = __lsx_vfmadd_s(_r44, _k34, _sum1); + + __m128 _k40 = (__m128)__lsx_vld(k0, 0); + __m128 _k41 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 -= 4 * 20; + + _sum0 = __lsx_vfmadd_s(_r40, _k40, _sum0); + _sum0 = __lsx_vfmadd_s(_r41, _k41, _sum0); + _sum0 = __lsx_vfmadd_s(_r42, _k42, _sum0); + _sum0 = __lsx_vfmadd_s(_r43, _k43, _sum0); + _sum0 = __lsx_vfmadd_s(_r44, _k44, _sum0); + + __m128 _r50 = (__m128)__lsx_vld(r5, 0); + __m128 _r51 = (__m128)__lsx_vld(r5 + 4, 0); + __m128 _r52 = (__m128)__lsx_vld(r5 + 4 * 2, 0); + 
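+                    // r5 is the sixth input row: since two output rows are produced per
+                    // iteration, it only contributes to _sum1 (the lower output row) through
+                    // the last kernel row _k40.._k44; _sum0 is already fully accumulated here.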
__m128 _r53 = (__m128)__lsx_vld(r5 + 4 * 3, 0); + __m128 _r54 = (__m128)__lsx_vld(r5 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r50, _k40, _sum1); + _sum1 = __lsx_vfmadd_s(_r51, _k41, _sum1); + _sum1 = __lsx_vfmadd_s(_r52, _k42, _sum1); + _sum1 = __lsx_vfmadd_s(_r53, _k43, _sum1); + _sum1 = __lsx_vfmadd_s(_r54, _k44, _sum1); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + + outptr0 += 4; + outptr1 += 4; + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + r4 += 4; + r5 += 4; + } + + r0 += 4 * 4 + w * 4; + r1 += 4 * 4 + w * 4; + r2 += 4 * 4 + w * 4; + r3 += 4 * 4 + w * 4; + r4 += 4 * 4 + w * 4; + r5 += 4 * 4 + w * 4; + + outptr0 += outw * 4; + outptr1 += outw * 4; + } + for (; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(r1 + 16); + __builtin_prefetch(r2 + 16); + __builtin_prefetch(r3 + 16); + __builtin_prefetch(r4 + 16); + + __builtin_prefetch(k0 + 400); + + __m128 _sum0 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + _sum0 = __lsx_vfmadd_s(_r03, _k03, _sum0); + _sum0 = __lsx_vfmadd_s(_r04, _k04, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0); + + __m128 _k10 = (__m128)__lsx_vld(k0, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k14 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + _sum0 = __lsx_vfmadd_s(_r13, _k13, _sum0); + _sum0 = __lsx_vfmadd_s(_r14, _k14, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0); + + __m128 _k20 = (__m128)__lsx_vld(k0, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + _sum0 = __lsx_vfmadd_s(_r23, _k23, _sum0); + _sum0 = __lsx_vfmadd_s(_r24, _k24, _sum0); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0); + __m128 _r34 = (__m128)__lsx_vld(r3 + 4 * 4, 0); + + __m128 _k30 = (__m128)__lsx_vld(k0, 0); + __m128 _k31 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k33 = 
(__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r30, _k30, _sum0); + _sum0 = __lsx_vfmadd_s(_r31, _k31, _sum0); + _sum0 = __lsx_vfmadd_s(_r32, _k32, _sum0); + _sum0 = __lsx_vfmadd_s(_r33, _k33, _sum0); + _sum0 = __lsx_vfmadd_s(_r34, _k34, _sum0); + + __m128 _r40 = (__m128)__lsx_vld(r4, 0); + __m128 _r41 = (__m128)__lsx_vld(r4 + 4, 0); + __m128 _r42 = (__m128)__lsx_vld(r4 + 4 * 2, 0); + __m128 _r43 = (__m128)__lsx_vld(r4 + 4 * 3, 0); + __m128 _r44 = (__m128)__lsx_vld(r4 + 4 * 4, 0); + + __m128 _k40 = (__m128)__lsx_vld(k0, 0); + __m128 _k41 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 -= 4 * 20; + + _sum0 = __lsx_vfmadd_s(_r40, _k40, _sum0); + _sum0 = __lsx_vfmadd_s(_r41, _k41, _sum0); + _sum0 = __lsx_vfmadd_s(_r42, _k42, _sum0); + _sum0 = __lsx_vfmadd_s(_r43, _k43, _sum0); + _sum0 = __lsx_vfmadd_s(_r44, _k44, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + r4 += 4; + } + + r0 += 4 * 4; + r1 += 4 * 4; + r2 += 4 * 4; + r3 += 4 * 4; + r4 += 4 * 4; + } + } +} + +static void convdw5x5s2_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const int tailstep = (w - 2 * outw + w) * 4; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + const float* k0 = kernel.row(g); + + float* outptr0 = out; + + const Mat img0 = bottom_blob.channel(g); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + const float* r3 = img0.row(3); + const float* r4 = img0.row(4); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(r1 + 32); + __builtin_prefetch(r2 + 32); + __builtin_prefetch(r3 + 32); + __builtin_prefetch(r4 + 32); + + __builtin_prefetch(k0 + 400); + + __m128 _sum0 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + _sum0 = __lsx_vfmadd_s(_r03, _k03, _sum0); + _sum0 = __lsx_vfmadd_s(_r04, _k04, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0); + + __m128 _k10 = (__m128)__lsx_vld(k0, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k14 = 
(__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + _sum0 = __lsx_vfmadd_s(_r13, _k13, _sum0); + _sum0 = __lsx_vfmadd_s(_r14, _k14, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0); + + __m128 _k20 = (__m128)__lsx_vld(k0, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + _sum0 = __lsx_vfmadd_s(_r23, _k23, _sum0); + _sum0 = __lsx_vfmadd_s(_r24, _k24, _sum0); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0); + __m128 _r34 = (__m128)__lsx_vld(r3 + 4 * 4, 0); + + __m128 _k30 = (__m128)__lsx_vld(k0, 0); + __m128 _k31 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k33 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r30, _k30, _sum0); + _sum0 = __lsx_vfmadd_s(_r31, _k31, _sum0); + _sum0 = __lsx_vfmadd_s(_r32, _k32, _sum0); + _sum0 = __lsx_vfmadd_s(_r33, _k33, _sum0); + _sum0 = __lsx_vfmadd_s(_r34, _k34, _sum0); + + __m128 _r40 = (__m128)__lsx_vld(r4, 0); + __m128 _r41 = (__m128)__lsx_vld(r4 + 4, 0); + __m128 _r42 = (__m128)__lsx_vld(r4 + 4 * 2, 0); + __m128 _r43 = (__m128)__lsx_vld(r4 + 4 * 3, 0); + __m128 _r44 = (__m128)__lsx_vld(r4 + 4 * 4, 0); + + __m128 _k40 = (__m128)__lsx_vld(k0, 0); + __m128 _k41 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 -= 4 * 20; + + _sum0 = __lsx_vfmadd_s(_r40, _k40, _sum0); + _sum0 = __lsx_vfmadd_s(_r41, _k41, _sum0); + _sum0 = __lsx_vfmadd_s(_r42, _k42, _sum0); + _sum0 = __lsx_vfmadd_s(_r43, _k43, _sum0); + _sum0 = __lsx_vfmadd_s(_r44, _k44, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 4 * 2; + r1 += 4 * 2; + r2 += 4 * 2; + r3 += 4 * 2; + r4 += 4 * 2; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + r3 += tailstep; + r4 += tailstep; + } + } +} diff --git a/src/layer/loongarch/convolutiondepthwise_loongarch.cpp b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp new file mode 100644 index 000000000000..4d134cc4a39a --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp @@ -0,0 +1,966 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolutiondepthwise_loongarch.h" + +#include "layer_type.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +#include "convolutiondepthwise_3x3.h" + +#if __loongarch_sx +#include "convolutiondepthwise_3x3_pack4.h" +#include "convolutiondepthwise_5x5_pack4.h" +#endif // __loongarch_sx + +ConvolutionDepthWise_loongarch::ConvolutionDepthWise_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx + + activation = 0; +} + +int ConvolutionDepthWise_loongarch::create_pipeline(const Option& opt) +{ + if (dynamic_weight) + return 0; + + activation = create_activation_layer(activation_type, activation_params, opt); + +#if NCNN_INT8 + if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + { + return create_pipeline_int8_loongarch(opt); + } +#endif + + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels == group && group == num_output) + { + int elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = channels % 4 == 0 ? 4 : 1; + } +#endif + +#if __loongarch_sx + // pack4 + if (elempack == 4) + { + Mat weight_data_r2 = weight_data.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, 4, opt); + } +#endif // __loongarch_sx + + if (elempack == 1) + { + weight_data_tm = weight_data; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; + } + + // group convolution + create_group_ops(opt); + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int ConvolutionDepthWise_loongarch::create_group_ops(const Option& opt) +{ + // create Convolution op for each group + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + for (int i = 0; i < (int)group_ops.size(); i++) + delete group_ops[i]; + + group_ops.clear(); + + const int channels_g = channels / group; + const int num_output_g = num_output / group; + + group_ops.resize(group); + + for (int g = 0; g < group; g++) + { + Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone(); + Mat bias_data_g; + if (bias_term) + bias_data_g = bias_data.range(num_output_g * g, num_output_g); + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); + + // set param + ncnn::ParamDict pd; + pd.set(0, num_output_g); // num_output + pd.set(1, kernel_w); + pd.set(11, kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, 0); // pad_w + pd.set(14, 0); // pad_h + pd.set(5, bias_term); + pd.set(6, maxk * channels_g * num_output_g); // weight_data_size + pd.set(8, int8_scale_term); + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + // set weights + if (bias_term) + { + ncnn::Mat weights[5]; + weights[0] = weight_data_g; + weights[1] = bias_data_g; + +#if NCNN_INT8 + if (int8_scale_term) + { + Mat weight_data_int8_scales_g(num_output_g); + weight_data_int8_scales_g.fill(weight_data_int8_scales[g]); + weights[2] = weight_data_int8_scales_g; + weights[3] = bottom_blob_int8_scales.range(g, 1); + } + if (int8_scale_term > 100) + { + weights[4] = top_blob_int8_scales.range(g, 1); + } +#endif + + 
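+            // The per-group Convolution op reads these mats in order through
+            // ModelBinFromMatArray: [0] group weight, [1] group bias, then with
+            // int8 scales [2] weight scales, [3] input scale and, when
+            // requantizing (int8_scale_term > 100), [4] output scale.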
op->load_model(ModelBinFromMatArray(weights)); + } + else + { + ncnn::Mat weights[4]; + weights[0] = weight_data_g; + +#if NCNN_INT8 + if (int8_scale_term) + { + Mat weight_data_int8_scales_g(num_output_g); + weight_data_int8_scales_g.fill(weight_data_int8_scales[g]); + weights[1] = weight_data_int8_scales_g; + weights[2] = bottom_blob_int8_scales.range(g, 1); + } + if (int8_scale_term > 100) + { + weights[3] = top_blob_int8_scales.range(g, 1); + } +#endif + + op->load_model(ModelBinFromMatArray(weights)); + } + + op->create_pipeline(opt); + + group_ops[g] = op; + } + + return 0; +} + +int ConvolutionDepthWise_loongarch::destroy_pipeline(const Option& opt) +{ + if (activation) + { + activation->destroy_pipeline(opt); + delete activation; + activation = 0; + } + + for (int i = 0; i < (int)group_ops.size(); i++) + { + group_ops[i]->destroy_pipeline(opt); + delete group_ops[i]; + } + group_ops.clear(); + + return 0; +} + +int ConvolutionDepthWise_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + return forward_int8_loongarch(bottom_blob, top_blob, opt); + } +#endif + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // depth-wise + if (channels * elempack == group && group == num_output) + { +#if __loongarch_sx + if (elempack == 4) + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convdw3x3s1_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convdw3x3s2_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convdw5x5s1_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convdw5x5s2_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + float* outptr = top_blob.channel(g); + const float* kptr = (const float*)weight_data_tm + maxk * g * 4; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = (__m128)__lsx_vld((const float*)bias_data + g * 4, 0); + } + + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0); + __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convdw3x3s1_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convdw3x3s2_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * 
dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + float* outptr = top_blob.channel(g); + const float* kptr = (const float*)weight_data_tm + maxk * g; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[g]; + + const float* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + float val = (float)sptr[space_ofs[k]]; + float w = (float)kptr[k]; + sum += val * w; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } + } + } + + return 0; + } + + // group convolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + g_elempack = channels_g % 4 == 0 ? 4 : 1; + out_g_elempack = num_output_g % 4 == 0 ? 4 : 1; + } +#endif + + // unpacking + Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p); + } + + Mat top_blob_unpacked = top_blob; + if (out_g_elempack < out_elempack) + { + top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_unpacked.allocator; + + // forward + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + else + { + top_blob = top_blob_unpacked; + } + + return 0; +} + +int ConvolutionDepthWise_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& _weight_data = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + const int _kernel_w = _weight_data.w; + const int _kernel_h = _weight_data.h; + const int _num_output = _weight_data.c * _weight_data.elempack; + + Mat weight_data_flattened; + flatten(_weight_data, weight_data_flattened, opt); + if (weight_data_flattened.empty()) + return -100; + + // weight_data_flattened as pack1 + weight_data_flattened.w *= weight_data_flattened.elempack; + weight_data_flattened.elemsize /= weight_data_flattened.elempack; + weight_data_flattened.elempack = 1; + + Mat bias_data_flattened; + if (bias_term) + { + const Mat& _bias_data = bottom_blobs[2]; + flatten(_bias_data, bias_data_flattened, opt); + if (bias_data_flattened.empty()) + return -100; + + // bias_data_flattened as pack1 + bias_data_flattened.w *= bias_data_flattened.elempack; + bias_data_flattened.elemsize /= bias_data_flattened.elempack; + 
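+        // Reinterpret the flattened bias in place as elempack=1: w was widened and
+        // elemsize shrunk above, and elempack is reset just below, mirroring the
+        // treatment of weight_data_flattened.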
bias_data_flattened.elempack = 1; + } + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise); + + ncnn::ParamDict pd; + pd.set(0, _num_output); + pd.set(1, _kernel_w); + pd.set(11, _kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, pad_left); + pd.set(15, pad_right); + pd.set(14, pad_top); + pd.set(16, pad_bottom); + pd.set(18, pad_value); + pd.set(5, bias_term); + pd.set(6, weight_data_flattened.w); + pd.set(7, group); + pd.set(8, int8_scale_term); + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + ncnn::Mat weights[2]; + weights[0] = weight_data_flattened; + weights[1] = bias_data_flattened; + + op->load_model(ncnn::ModelBinFromMatArray(weights)); + + op->create_pipeline(opt); + + op->forward(bottom_blob, top_blob, opt); + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + +#if NCNN_INT8 +int ConvolutionDepthWise_loongarch::create_pipeline_int8_loongarch(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels == group && group == num_output) + { + int elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = channels % 8 == 0 ? 8 : 1; + } +#endif // __loongarch_sx + + if (elempack == 8) + { + Mat weight_data_r2 = weight_data.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, 8, opt); + } + + if (elempack == 1) + { + weight_data_tm = weight_data; + } + + return 0; + } + + // group convolution + create_group_ops(opt); + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int ConvolutionDepthWise_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int elempack = bottom_blob.elempack; + + int elembits = bottom_blob.elembits(); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + const int channels_g = channels * elempack / group; + + Mat scales(channels * elempack); + { + float* ps = scales; + for (int g = 0; g < group; g++) + { + float scale = bottom_blob_int8_scales[g]; + for (int q = 0; q < channels_g; q++) + { + *ps++ = scale; + } + } + } + + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q); + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + channels = bottom_blob_bordered.c; + elempack = bottom_blob_bordered.elempack; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + // depth-wise + if (channels * elempack == group && group == num_output) + { + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif // __loongarch_sx + bool use_int8_requantize = int8_scale_term > 100; + size_t out_elemsize = use_int8_requantize ? 
1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 8) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g * 8; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vld(sptr + space_ofs[k] * 8, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr + k * 8, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + } + + __m128 _scale_in0; + __m128 _scale_in1; + { + __m128 _bottom_blob_int8_scales0 = (__m128)__lsx_vld((const float*)bottom_blob_int8_scales + g * 8, 0); + __m128 _bottom_blob_int8_scales1 = (__m128)__lsx_vld((const float*)bottom_blob_int8_scales + g * 8 + 4, 0); + __m128 _weight_data_int8_scales0 = (__m128)__lsx_vld((const float*)weight_data_int8_scales + g * 8, 0); + __m128 _weight_data_int8_scales1 = (__m128)__lsx_vld((const float*)weight_data_int8_scales + g * 8 + 4, 0); + _scale_in0 = __lsx_vfrecip_s(__lsx_vfmul_s(_bottom_blob_int8_scales0, _weight_data_int8_scales0)); + _scale_in1 = __lsx_vfrecip_s(__lsx_vfmul_s(_bottom_blob_int8_scales1, _weight_data_int8_scales1)); + + __m128i _m0 = __lsx_vfcmp_cne_s(_weight_data_int8_scales0, __lsx_vreplfr2vr_s(0.f)); + __m128i _m1 = __lsx_vfcmp_cne_s(_weight_data_int8_scales1, __lsx_vreplfr2vr_s(0.f)); + _scale_in0 = (__m128)__lsx_vand_v((__m128i)_scale_in0, (__m128i)_m0); + _scale_in1 = (__m128)__lsx_vand_v((__m128i)_scale_in1, (__m128i)_m1); + } + + __m128 _sumfp32_0 = __lsx_vfmul_s(__lsx_vffint_s_w(_sum0), _scale_in0); + __m128 _sumfp32_1 = __lsx_vfmul_s(__lsx_vffint_s_w(_sum1), _scale_in1); + + if (bias_term) + { + __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + g * 8, 0); + __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + g * 8 + 4, 0); + _sumfp32_0 = __lsx_vfadd_s(_sumfp32_0, _bias0); + _sumfp32_1 = __lsx_vfadd_s(_sumfp32_1, _bias1); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params); + + if (use_int8_requantize) + { + // requantize and relu + __m128 _scale_out0 = (__m128)__lsx_vld((const float*)top_blob_int8_scales + g * 8, 0); + __m128 _scale_out1 = (__m128)__lsx_vld((const float*)top_blob_int8_scales + g * 8 + 4, 0); + _sumfp32_0 = __lsx_vfmul_s(_sumfp32_0, 
_scale_out0); + _sumfp32_1 = __lsx_vfmul_s(_sumfp32_1, _scale_out1); + int64_t _sum8 = float2int8(_sumfp32_0, _sumfp32_1); + + *(int64_t*)outptr_s8 = _sum8; + outptr_s8 += 8; + } + else + { + // dequantize and relu + __lsx_vst(_sumfp32_0, outptr_f32, 0); + __lsx_vst(_sumfp32_1, outptr_f32 + 4, 0); + outptr_f32 += 8; + } + } + } + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int sum = 0; + + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + signed char val = sptr[space_ofs[k]]; + signed char w = kptr[k]; + sum += val * w; + } + + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + float sumfp32 = sum * scale_in; + + if (bias_term) + sumfp32 += bias_data[g]; + + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + if (use_int8_requantize) + { + // requantize + float scale_out = top_blob_int8_scales[g]; + signed char sums8 = float2int8(sumfp32 * scale_out); + outptr_s8[0] = sums8; + outptr_s8 += 1; + } + else + { + // dequantize + outptr_f32[0] = sumfp32; + outptr_f32 += 1; + } + } + } + } + } + } + + return 0; + } + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + if (use_int8_requantize) + out_elempack = num_output % 8 == 0 ? 8 : 1; + else + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __loongarch_sx + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // group convolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + g_elempack = channels_g % 8 == 0 ? 8 : 1; + if (use_int8_requantize) + out_g_elempack = num_output_g % 8 == 0 ? 8 : 1; + else + out_g_elempack = num_output_g % 4 == 0 ? 
4 : 1; + } +#endif // __loongarch_sx + + // unpacking + Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); + } + + Mat top_blob_unpacked = top_blob; + if (out_g_elempack < out_elempack) + { + top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_unpacked.allocator; + + // forward + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + else + { + top_blob = top_blob_unpacked; + } + + return 0; +} +#endif // NCNN_INT8 + +} // namespace ncnn diff --git a/src/layer/loongarch/convolutiondepthwise_loongarch.h b/src/layer/loongarch/convolutiondepthwise_loongarch.h new file mode 100644 index 000000000000..554fe7643049 --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_loongarch.h @@ -0,0 +1,50 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#ifndef LAYER_CONVOLUTIONDEPTHWISE_LOONGARCH_H +#define LAYER_CONVOLUTIONDEPTHWISE_LOONGARCH_H + +#include "convolutiondepthwise.h" + +namespace ncnn { + +class ConvolutionDepthWise_loongarch : virtual public ConvolutionDepthWise +{ +public: + ConvolutionDepthWise_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +protected: + int create_group_ops(const Option& opt); +#if NCNN_INT8 + int create_pipeline_int8_loongarch(const Option& opt); + int forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + +public: + Layer* activation; + std::vector group_ops; + + Mat weight_data_tm; +}; + +} // namespace ncnn + +#endif // LAYER_CONVOLUTIONDEPTHWISE_LOONGARCH_H diff --git a/src/layer/loongarch/crop_loongarch.cpp b/src/layer/loongarch/crop_loongarch.cpp new file mode 100644 index 000000000000..e7c588bc4760 --- /dev/null +++ b/src/layer/loongarch/crop_loongarch.cpp @@ -0,0 +1,399 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "crop_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +namespace ncnn { + +Crop_loongarch::Crop_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +#if __loongarch_sx +static void crop_pack4_lsx(const Mat& src, Mat& dst, int top, int left) +{ + int w = dst.w; + int h = dst.h; + int right = src.w - dst.w - left; + + const float* ptr = src.row(top) + left * 4; + float* outptr = dst; + + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } + + ptr += (left + right) * 4; + } +} +#endif // __loongarch_sx + +int Crop_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 4) + { + int _woffset, _hoffset, _doffset, _coffset; + int _outw, _outh, _outd, _outc; + resolve_crop_roi(bottom_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc); + + if (dims == 1) + { + int out_elempack = _outw % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw / out_elempack == w && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_woffset % 4 == 0 && out_elempack == 4) + { + top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + crop_pack4_lsx(bottom_blob, top_blob, 0, _woffset / elempack); + + return 0; + } + } + + if (dims == 2) + { + int out_elempack = _outh % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh / out_elempack == h && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_hoffset % 4 == 0 && out_elempack == 4) + { + top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + crop_pack4_lsx(bottom_blob, top_blob, _hoffset / elempack, _woffset); + + return 0; + } + } + + if (dims == 3) + { + int out_elempack = _outc % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_coffset % 4 == 0 && out_elempack == 4) + { + const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack); + + if (_outw == w && _outh == h) + { + top_blob = bottom_blob_sliced.clone(); + if (top_blob.empty()) + return -100; + } + + top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < top_blob.c; q++) + { + const Mat m = bottom_blob_sliced.channel(q); + Mat borderm = top_blob.channel(q); + + crop_pack4_lsx(m, borderm, _hoffset, _woffset); + } + + return 0; + } + } + + if (dims == 4) + { + int out_elempack = _outc % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_coffset % 4 == 0 && out_elempack == 4) + { + const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack); + + if (_outw == w && _outh == h && _outd == d) + { + top_blob = bottom_blob_sliced.clone(); + if (top_blob.empty()) + return -100; + } + + top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < top_blob.c; q++) + { + for (int z = 0; z < _outd; z++) + { + const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset); + Mat borderm = top_blob.channel(q).depth(z); + + crop_pack4_lsx(m, borderm, _hoffset, _woffset); + } + } + + return 0; + } + } + } +#endif // __loongarch_sx + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + } + + return Crop::forward(bottom_blob_unpacked, top_blob, opt); +} + +int Crop_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& reference_blob = bottom_blobs[1]; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int ref_elempack = reference_blob.elempack; + + Mat& top_blob = top_blobs[0]; + +#if __loongarch_sx + if (elempack == 4) + { + int _woffset, _hoffset, _doffset, _coffset; + int _outw, _outh, _outd, _outc; + if (woffset == -233) + { + resolve_crop_roi(bottom_blob.shape(), (const int*)reference_blob, _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc); + } + else + { + resolve_crop_roi(bottom_blob.shape(), reference_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc); + } + + if (dims == 1) + { + int out_elempack = _outw % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw / out_elempack == w && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_woffset % 4 == 0 && out_elempack == 4) + { + top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + crop_pack4_lsx(bottom_blob, top_blob, 0, _woffset / elempack); + + return 0; + } + } + + if (dims == 2) + { + int out_elempack = _outh % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh / out_elempack == h && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_hoffset % 4 == 0 && out_elempack == 4) + { + top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + crop_pack4_lsx(bottom_blob, top_blob, _hoffset / elempack, _woffset); + + return 0; + } + } + + if (dims == 3) + { + int out_elempack = _outc % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_coffset % 4 == 0 && out_elempack == 4) + { + const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack); + + if (_outw == w && _outh == h) + { + top_blob = bottom_blob_sliced.clone(); + if (top_blob.empty()) + return -100; + } + + top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < top_blob.c; q++) + { + const Mat m = bottom_blob_sliced.channel(q); + Mat borderm = top_blob.channel(q); + + crop_pack4_lsx(m, borderm, _hoffset, _woffset); + } + + return 0; + } + } + + if (dims == 4) + { + int out_elempack = _outc % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_coffset % 4 == 0 && out_elempack == 4) + { + const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack); + + if (_outw == w && _outh == h && _outd == d) + { + top_blob = bottom_blob_sliced.clone(); + if (top_blob.empty()) + return -100; + } + + top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < top_blob.c; q++) + { + for (int z = 0; z < _outd; z++) + { + const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset); + Mat borderm = top_blob.channel(q).depth(z); + + crop_pack4_lsx(m, borderm, _hoffset, _woffset); + } + } + + return 0; + } + } + } +#endif // __loongarch_sx + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + } + + Mat reference_blob_unpacked = reference_blob; + if (ref_elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(reference_blob, reference_blob_unpacked, 1, opt_pack1); + } + + std::vector bottom_blobs_unpacked(2); + bottom_blobs_unpacked[0] = bottom_blob_unpacked; + bottom_blobs_unpacked[1] = reference_blob_unpacked; + + return Crop::forward(bottom_blobs_unpacked, top_blobs, opt); +} + +} // namespace ncnn diff --git a/src/layer/loongarch/crop_loongarch.h b/src/layer/loongarch/crop_loongarch.h new file mode 100644 index 000000000000..0ba460256d6a --- /dev/null +++ b/src/layer/loongarch/crop_loongarch.h @@ -0,0 +1,34 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CROP_LOONGARCH_H +#define LAYER_CROP_LOONGARCH_H + +#include "crop.h" + +namespace ncnn { + +class Crop_loongarch : virtual public Crop +{ +public: + Crop_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_CROP_LOONGARCH_H diff --git a/src/layer/loongarch/deconvolution_loongarch.cpp b/src/layer/loongarch/deconvolution_loongarch.cpp new file mode 100644 index 000000000000..bb913909b551 --- /dev/null +++ b/src/layer/loongarch/deconvolution_loongarch.cpp @@ -0,0 +1,284 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "deconvolution_loongarch.h" + +#include "layer_type.h" + +#if __loongarch_sx +#include <lsxintrin.h> +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +#if __loongarch_sx +#include "deconvolution_pack4.h" +#include "deconvolution_pack1to4.h" +#include "deconvolution_pack4to1.h" +#endif // __loongarch_sx + +Deconvolution_loongarch::Deconvolution_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Deconvolution_loongarch::create_pipeline(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + int num_input = weight_data_size / maxk / num_output; + + Mat weight_data_transposed(weight_data.w); + { + float* pt = weight_data_transposed; + const float* p = weight_data; + + for (int i = 0; i < num_input * num_output; i++) + { + for (int k = 0; k < maxk; k++) + { + pt[maxk - 1 - k] = p[k]; + } + + p += maxk; + pt += maxk; + } + } + + int elempack = 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 4 == 0 ?
4 : 1; + } +#endif + + // src = kw-kh-inch-outch + // dst = pb-pa-kw-kh-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output); + + weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + float* g00 = weight_data_tm.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < elempack; i++) + { + for (int j = 0; j < out_elempack; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + } + +#if __loongarch_sx + // pack4 + if (elempack == 4 && out_elempack == 4) + { + } + + // pack1ton + if (elempack == 1 && out_elempack == 4) + { + } + + // pack4to1 + if (elempack == 4 && out_elempack == 1) + { + } +#endif // __loongarch_sx + + // pack1 + if (elempack == 1 && out_elempack == 1) + { + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int Deconvolution_loongarch::destroy_pipeline(const Option& opt) +{ + return 0; +} + +int Deconvolution_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + // deconvolv with NxN kernel + // value = value + bias + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + // NCNN_LOGE("Deconvolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; + int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat top_blob_bordered; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0)) + { + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator); + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + } + if (top_blob_bordered.empty()) + return -100; + + const int maxk = kernel_w * kernel_h; + +#if __loongarch_sx + if (elempack == 4 && out_elempack == 4) + { + { + deconvolution_pack4_lsx(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 4) + { + { + deconvolution_pack1to4_lsx(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == 4 && out_elempack == 1) + { + { + deconvolution_pack4to1_lsx(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } +#endif // __loongarch_sx + + if (elempack == 1 && out_elempack == 1) + { + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output; p++) + { + float* outptr = top_blob_bordered.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + const float* kptr = (const float*)weight_data_tm.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + const float* sptr = m.row(sy); + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + float val = sptr[sx]; + + int k = y * kernel_w + x; + + float w = kptr[k]; + + sum += val * w; + } + } + + kptr += maxk; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } + } + } + + cut_padding(top_blob_bordered, top_blob, opt); + if (top_blob.empty()) + return -100; + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/deconvolution_loongarch.h b/src/layer/loongarch/deconvolution_loongarch.h new file mode 100644 index 000000000000..bb7653b563fa --- /dev/null +++ b/src/layer/loongarch/deconvolution_loongarch.h @@ -0,0 +1,38 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DECONVOLUTION_LOONGARCH_H +#define LAYER_DECONVOLUTION_LOONGARCH_H + +#include "deconvolution.h" + +namespace ncnn { + +class Deconvolution_loongarch : virtual public Deconvolution +{ +public: + Deconvolution_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +public: + Mat weight_data_tm; +}; + +} // namespace ncnn + +#endif // LAYER_DECONVOLUTION_LOONGARCH_H diff --git a/src/layer/loongarch/deconvolution_pack1to4.h b/src/layer/loongarch/deconvolution_pack1to4.h new file mode 100644 index 000000000000..ee1f932b57a9 --- /dev/null +++ b/src/layer/loongarch/deconvolution_pack1to4.h @@ -0,0 +1,99 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
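+// deconvolution_pack1to4_lsx: the bottom blob is unpacked (elempack=1) and the top
+// blob is packed by 4 (elempack=4). For each output pixel the loop walks the kernel
+// window in reverse (sys = i + y*dilation_h - (kernel_extent_h - 1)) and keeps only
+// taps that land on a stride multiple, then multiplies one input scalar against a
+// 4-wide weight vector with __lsx_vfmadd_s so four output channels accumulate at once.
+// Weights are assumed to arrive in the [outch/4][inch][maxk][4] order written by
+// Deconvolution_loongarch::create_pipeline.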
+ +static void deconvolution_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_data_ptr) + { + _sum = (__m128)__lsx_vld((const float*)bias_data_ptr + p * 4, 0); + } + + const float* kptr = (const float*)weight_data_pack1ton + maxk * channels * p * 4; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + const float* sptr = m.row(sy); + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + float val = sptr[sx]; + + int k = y * kernel_w + x; + + __m128 _val = (__m128)__lsx_vreplfr2vr_s(val); + __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + } + } + + kptr += maxk * 4; + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/deconvolution_pack4.h b/src/layer/loongarch/deconvolution_pack4.h new file mode 100644 index 000000000000..179a410350fb --- /dev/null +++ b/src/layer/loongarch/deconvolution_pack4.h @@ -0,0 +1,106 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
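+// deconvolution_pack4_lsx: both blobs use elempack=4, so every kernel tap carries a
+// 4x4 weight tile (16 floats). The inner loop broadcasts each of the four input lanes
+// with __lsx_vreplfr2vr_s and accumulates them against the matching weight rows via
+// __lsx_vfmadd_s, i.e. a small 4x4 matrix-vector product per valid tap.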
+ +static void deconvolution_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_data_ptr) + { + _sum = (__m128)__lsx_vld((const float*)bias_data_ptr + p * 4, 0); + } + + const float* kptr = (const float*)weight_data_pack4.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + const float* sptr = m.row(sy) + sx * 4; + + int k = (y * kernel_w + x) * 16; + + __m128 _val0 = (__m128)__lsx_vreplfr2vr_s(*sptr++); + __m128 _val1 = (__m128)__lsx_vreplfr2vr_s(*sptr++); + __m128 _val2 = (__m128)__lsx_vreplfr2vr_s(*sptr++); + __m128 _val3 = (__m128)__lsx_vreplfr2vr_s(*sptr++); + __m128 _w0 = (__m128)__lsx_vld(kptr + k, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + k + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + k + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + k + 12, 0); + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + _sum = __lsx_vfmadd_s(_w1, _val1, _sum); + _sum = __lsx_vfmadd_s(_w2, _val2, _sum); + _sum = __lsx_vfmadd_s(_w3, _val3, _sum); + } + } + + kptr += maxk * 16; + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/deconvolution_pack4to1.h b/src/layer/loongarch/deconvolution_pack4to1.h new file mode 100644 index 000000000000..e13721c2c35d --- /dev/null +++ b/src/layer/loongarch/deconvolution_pack4to1.h @@ -0,0 +1,101 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
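+// deconvolution_pack4to1_lsx: the bottom blob is packed by 4 and the top blob is
+// scalar. Each valid tap performs one 4-wide multiply-accumulate; the four lanes are
+// then collapsed with __lsx_reduce_fadd_s and added to the scalar bias sum before the
+// scalar activation_ss is applied.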
+ +static void deconvolution_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4to1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_data_ptr) + { + sum = bias_data_ptr[p]; + } + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + const float* kptr = (const float*)weight_data_pack4to1 + maxk * channels * p * 4; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + const float* sptr = m.row(sy) + sx * 4; + + int k = y * kernel_w + x; + + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + } + } + + kptr += maxk * 4; + } + + sum += __lsx_reduce_fadd_s(_sum); + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } +} diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp new file mode 100644 index 000000000000..a141dd703601 --- /dev/null +++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp @@ -0,0 +1,412 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
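+// Rough structure of this file: the true depth-wise case (channels == group ==
+// num_output) is handled directly with an LSX pack4 path and a scalar pack1 path,
+// while general grouped deconvolution falls back to one ncnn::Deconvolution layer per
+// group created in create_group_ops, converting packing around the per-group calls.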
+ +#include "deconvolutiondepthwise_loongarch.h" + +#include "layer_type.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +DeconvolutionDepthWise_loongarch::DeconvolutionDepthWise_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int DeconvolutionDepthWise_loongarch::create_pipeline(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels == group && group == num_output) + { + int elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = channels % 4 == 0 ? 4 : 1; + } +#endif + + Mat weight_data_transposed(weight_data.w); + { + float* pt = weight_data_transposed; + const float* p = weight_data; + + for (int i = 0; i < (channels / group) * (num_output / group) * group; i++) + { + for (int k = 0; k < maxk; k++) + { + pt[maxk - 1 - k] = p[k]; + } + + p += maxk; + pt += maxk; + } + } + +#if __loongarch_sx + // pack4 + if (elempack == 4) + { + Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, 4, opt); + } +#endif // __loongarch_sx + + if (elempack == 1) + { + weight_data_tm = weight_data_transposed; + } + + return 0; + } + + // group convolution + create_group_ops(opt); + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int DeconvolutionDepthWise_loongarch::create_group_ops(const Option& opt) +{ + // create Deconvolution op for each group + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + for (int i = 0; i < (int)group_ops.size(); i++) + delete group_ops[i]; + + group_ops.clear(); + + const int channels_g = channels / group; + const int num_output_g = num_output / group; + + group_ops.resize(group); + + for (int g = 0; g < group; g++) + { + Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone(); + Mat bias_data_g; + if (bias_term) + bias_data_g = bias_data.range(num_output_g * g, num_output_g); + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + + // set param + ncnn::ParamDict pd; + pd.set(0, num_output_g); // num_output + pd.set(1, kernel_w); + pd.set(11, kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, 0); // pad_w + pd.set(14, 0); // pad_h + pd.set(18, output_pad_right); + pd.set(19, output_pad_bottom); + pd.set(5, bias_term); + pd.set(6, maxk * channels_g * num_output_g); // weight_data_size + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + // set weights + if (bias_term) + { + ncnn::Mat weights[2]; + weights[0] = weight_data_g; + weights[1] = bias_data_g; + + op->load_model(ModelBinFromMatArray(weights)); + } + else + { + ncnn::Mat weights[1]; + weights[0] = weight_data_g; + + op->load_model(ModelBinFromMatArray(weights)); + } + + op->create_pipeline(opt); + + group_ops[g] = op; + } + + return 0; +} + +int DeconvolutionDepthWise_loongarch::destroy_pipeline(const Option& opt) +{ + for (int i = 0; i < (int)group_ops.size(); i++) + { + group_ops[i]->destroy_pipeline(opt); + delete group_ops[i]; + } + group_ops.clear(); + + return 0; +} + +int DeconvolutionDepthWise_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) 
const +{ + // convolv with NxN kernel + // value = value + bias + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; + int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat top_blob_bordered; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0)) + { + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator); + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + } + if (top_blob_bordered.empty()) + return -100; + + const int maxk = kernel_w * kernel_h; + + // depth-wise + if (channels * elempack == group && group == num_output) + { +#if __loongarch_sx + if (elempack == 4) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + float* outptr = top_blob_bordered.channel(g); + const float* kptr = (const float*)weight_data_tm + maxk * g * 4; + const Mat m = bottom_blob.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = (__m128)__lsx_vld((const float*)bias_data + g * 4, 0); + } + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + const float* sptr = m.row(sy) + sx * 4; + + int k = y * kernel_w + x; + + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + float* outptr = top_blob_bordered.channel(g); + const float* kptr = (const float*)weight_data_tm + maxk * g; + const Mat m = bottom_blob.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[g]; + } + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + const float* sptr = m.row(sy); + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + float val = sptr[sx]; + + int k = y * kernel_w + x; + + float w = 
kptr[k]; + + sum += val * w; + } + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } + } + } + else + { + // group deconvolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + g_elempack = channels_g % 4 == 0 ? 4 : 1; + out_g_elempack = num_output_g % 4 == 0 ? 4 : 1; + } +#endif + + // unpacking + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p); + } + + Mat top_blob_bordered_unpacked = top_blob_bordered; + if (out_g_elempack < out_elempack) + { + top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator); + if (top_blob_bordered_unpacked.empty()) + return -100; + } + + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_bordered_unpacked.allocator; + + // forward + op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt); + } + else + { + top_blob_bordered = top_blob_bordered_unpacked; + } + } + + cut_padding(top_blob_bordered, top_blob, opt); + if (top_blob.empty()) + return -100; + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.h b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h new file mode 100644 index 000000000000..e41e7cac9e18 --- /dev/null +++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h @@ -0,0 +1,43 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
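+// DeconvolutionDepthWise_loongarch keeps weight_data_tm (reordered weights for the
+// depth-wise path) and group_ops (one Deconvolution layer per group for the grouped
+// fallback); both are set up in create_pipeline, the latter via create_group_ops.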
+ +#ifndef LAYER_DECONVOLUTIONDEPTHWISE_LOONGARCH_H +#define LAYER_DECONVOLUTIONDEPTHWISE_LOONGARCH_H + +#include "deconvolutiondepthwise.h" + +namespace ncnn { + +class DeconvolutionDepthWise_loongarch : virtual public DeconvolutionDepthWise +{ +public: + DeconvolutionDepthWise_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + int create_group_ops(const Option& opt); + +public: + std::vector group_ops; + + Mat weight_data_tm; +}; + +} // namespace ncnn + +#endif // LAYER_DECONVOLUTIONDEPTHWISE_LOONGARCH_H diff --git a/src/layer/loongarch/dequantize_loongarch.cpp b/src/layer/loongarch/dequantize_loongarch.cpp new file mode 100644 index 000000000000..5ee9595f89f0 --- /dev/null +++ b/src/layer/loongarch/dequantize_loongarch.cpp @@ -0,0 +1,838 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "dequantize_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Dequantize_loongarch::Dequantize_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Dequantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + // assert bottom_blob.elembits() == 32 + + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 8) + { + if (dims == 1) + { + int w = bottom_blob.w; + int outw = w * 2; + + top_blob.create(outw, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, 
ptr, 0); + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int outh = h * 2; + + top_blob.create(w, outh, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 2); + float* ptr1 = top_blob.row(i * 2 + 1); + + __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8, 0); + __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v1, ptr1, 0); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 2); + float* ptr1 = top_blob.row(i * 2 + 1); + + __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8, 0); + __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v1, ptr1, 0); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = channels * 2; + + top_blob.create(w, h, outc, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 2); + float* ptr1 = top_blob.channel(q * 2 + 1); + + __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8, 0); + __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8 + 4, 0); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v2, ptr0 + 4, 0); + __lsx_vst(_v1, ptr1, 0); + __lsx_vst(_v3, ptr1 + 4, 0); + + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v1, ptr1, 0); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 2); + float* ptr1 = top_blob.channel(q * 2 + 1); + + __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8, 0); + __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v2, ptr0 + 4, 0); + __lsx_vst(_v1, ptr1, 0); + __lsx_vst(_v3, ptr1 + 4, 0); + + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v1, ptr1, 0); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + else + { + #pragma omp parallel for 
num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 4, 0); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + __lsx_vst(_v0, ptr, 0); + __lsx_vst(_v1, ptr + 4, 0); + + intptr += 8; + ptr += 8; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 4, 0); + __m128 _bias = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale, _v1, _bias); + __lsx_vst(_v0, ptr, 0); + __lsx_vst(_v1, ptr + 4, 0); + + intptr += 8; + ptr += 8; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } + } + } + } + + return 0; + } +#endif // __loongarch_sx + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + float* ptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias_data[i]; + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i]; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias_data[i]; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + int j = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + for (; j + 3 < w; j += 4) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + *ptr++ = *intptr++ * scale; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const float bias = bias_data_size == 1 ? 
bias_data[0] : bias_data[i]; + + int j = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias); + for (; j + 3 < w; j += 4) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + *ptr++ = *intptr++ * scale + bias; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + + int i = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + __lsx_vst(_v0, ptr, 0); + __lsx_vst(_v1, ptr + 4, 0); + + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr++ = *intptr++ * scale; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + int i = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias); + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale, _v1, _bias); + __lsx_vst(_v0, ptr, 0); + __lsx_vst(_v1, ptr + 4, 0); + + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr++ = *intptr++ * scale + bias; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/dequantize_loongarch.h b/src/layer/loongarch/dequantize_loongarch.h new file mode 100644 index 000000000000..61a408d5c505 --- /dev/null +++ b/src/layer/loongarch/dequantize_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DEQUANTIZE_LOONGARCH_H +#define LAYER_DEQUANTIZE_LOONGARCH_H + +#include "dequantize.h" + +namespace ncnn { + +class Dequantize_loongarch : virtual public Dequantize +{ +public: + Dequantize_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_DEQUANTIZE_LOONGARCH_H diff --git a/src/layer/loongarch/dropout_loongarch.cpp b/src/layer/loongarch/dropout_loongarch.cpp new file mode 100644 index 000000000000..04a1f9ea95d8 --- /dev/null +++ b/src/layer/loongarch/dropout_loongarch.cpp @@ -0,0 +1,75 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "dropout_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Dropout_loongarch::Dropout_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Dropout_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + if (scale == 1.f) + { + return 0; + } + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmul_s(_p, _scale); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = *ptr * scale; + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/dropout_loongarch.h b/src/layer/loongarch/dropout_loongarch.h new file mode 100644 index 000000000000..42810050677a --- /dev/null +++ b/src/layer/loongarch/dropout_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DROPOUT_LOONGARCH_H +#define LAYER_DROPOUT_LOONGARCH_H + +#include "dropout.h" + +namespace ncnn { + +class Dropout_loongarch : virtual public Dropout +{ +public: + Dropout_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_DROPOUT_LOONGARCH_H diff --git a/src/layer/loongarch/eltwise_loongarch.cpp b/src/layer/loongarch/eltwise_loongarch.cpp new file mode 100644 index 000000000000..d803fc3db78e --- /dev/null +++ b/src/layer/loongarch/eltwise_loongarch.cpp @@ -0,0 +1,332 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "eltwise_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Eltwise_loongarch::Eltwise_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Eltwise_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int elempack = bottom_blob.elempack; + int size = w * h * elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create_like(bottom_blob, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (op_type == Operation_PROD) + { + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + _p = __lsx_vfmul_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = *ptr * *ptr1; + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(outptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr, 0); 
+ _p = __lsx_vfmul_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr *= *ptr; + + ptr++; + outptr++; + } + } + } + } + if (op_type == Operation_SUM) + { + if (coeffs.w == 0) + { + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + _p = __lsx_vfadd_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = *ptr + *ptr1; + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(outptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfadd_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr += *ptr; + + ptr++; + outptr++; + } + } + } + } + else + { + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + float coeff0 = coeffs[0]; + float coeff1 = coeffs[1]; +#if __loongarch_sx + __m128 _coeff0 = (__m128)__lsx_vreplfr2vr_s(coeff0); + __m128 _coeff1 = (__m128)__lsx_vreplfr2vr_s(coeff1); +#endif // __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + _p = __lsx_vfmul_s(_p, _coeff0); + _p = __lsx_vfmadd_s(_coeff1, _p1, _p); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = *ptr * coeff0 + *ptr1 * coeff1; + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + float coeff = coeffs[b]; +#if __loongarch_sx + __m128 _coeff = (__m128)__lsx_vreplfr2vr_s(coeff); +#endif // __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(outptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmadd_s(_coeff, _p1, _p); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr += *ptr * coeff; + + ptr++; + outptr++; + } + } + } + } + } + if (op_type == Operation_MAX) + { + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = 
bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + _p = __lsx_vfmax_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = std::max(*ptr, *ptr1); + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(outptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmax_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = std::max(*ptr, *outptr); + + ptr++; + outptr++; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/eltwise_loongarch.h b/src/layer/loongarch/eltwise_loongarch.h new file mode 100644 index 000000000000..f9715b20cadc --- /dev/null +++ b/src/layer/loongarch/eltwise_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_ELTWISE_LOONGARCH_H +#define LAYER_ELTWISE_LOONGARCH_H + +#include "eltwise.h" + +namespace ncnn { + +class Eltwise_loongarch : virtual public Eltwise +{ +public: + Eltwise_loongarch(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_ELTWISE_LOONGARCH_H diff --git a/src/layer/loongarch/flatten_loongarch.cpp b/src/layer/loongarch/flatten_loongarch.cpp new file mode 100644 index 000000000000..6d9a86362873 --- /dev/null +++ b/src/layer/loongarch/flatten_loongarch.cpp @@ -0,0 +1,370 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
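
Eltwise_loongarch combines the first two bottom blobs into top_blob and then folds every additional blob into top_blob in place, so each extra input costs one read plus one read-modify-write of the output. In the coefficient form of Operation_SUM, the vector body computes p * coeff0 with __lsx_vfmul_s and folds in p1 * coeff1 with a single __lsx_vfmadd_s. A reduced two-input sketch of that weighted sum, under the same -mlsx and helper assumptions as the sketch above:

// Sketch only: out[i] = a[i] * c0 + b[i] * c1, mirroring the coefficient SUM loop above.
static void weighted_sum2(const float* a, const float* b, float* out, int size, float c0, float c1)
{
    int i = 0;
#if __loongarch_sx
    __m128 _c0 = (__m128)__lsx_vreplfr2vr_s(c0);
    __m128 _c1 = (__m128)__lsx_vreplfr2vr_s(c1);
    for (; i + 3 < size; i += 4)
    {
        __m128 _p = (__m128)__lsx_vld(a, 0);
        __m128 _q = (__m128)__lsx_vld(b, 0);
        _p = __lsx_vfmul_s(_p, _c0);      // a * c0
        _p = __lsx_vfmadd_s(_c1, _q, _p); // + b * c1
        __lsx_vst(_p, out, 0);
        a += 4;
        b += 4;
        out += 4;
    }
#endif // __loongarch_sx
    for (; i < size; i++)
    {
        *out++ = *a++ * c0 + *b++ * c1;
    }
}
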
+ +#include "flatten_loongarch.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +namespace ncnn { + +Flatten_loongarch::Flatten_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Flatten_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + if (elembits == 8) + return forward_int8(bottom_blob, top_blob, opt); + + int dims = bottom_blob.dims; + + if (dims == 1) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + int size = w * h * d; + + int total = size * channels * elempack; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = total % 4 == 0 ? 4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (out_elempack == 1) + { + return Flatten::forward(bottom_blob, top_blob, opt); + } + + if (dims == 2 && elempack == 1) // out_elempack == 4 + { + top_blob = bottom_blob; + top_blob.dims = 1; + top_blob.w = total / out_elempack; + top_blob.h = 1; + top_blob.cstep = top_blob.w; + top_blob.elemsize = out_elemsize; + top_blob.elempack = out_elempack; + return 0; + } + + top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (dims == 2) + { +#if __loongarch_sx + if (elempack == 4) // out_elempack == 4 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(i); + float* outptr0 = (float*)top_blob + w * i * 4; + float* outptr1 = (float*)top_blob + w * (i * 4 + 1); + float* outptr2 = (float*)top_blob + w * (i * 4 + 2); + float* outptr3 = (float*)top_blob + w * (i * 4 + 3); + + int j = 0; + for (; j + 3 < w; j += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(ptr, 0); + __m128i _r1 = __lsx_vld(ptr + 4, 0); + __m128i _r2 = __lsx_vld(ptr + 4 * 2, 0); + __m128i _r3 = __lsx_vld(ptr + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr0, 0); + __lsx_vst(_r0123_1, outptr1, 0); + __lsx_vst(_r0123_2, outptr2, 0); + __lsx_vst(_r0123_3, outptr3, 0); + + ptr += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } + for (; j < w; j++) + { + *outptr0++ = ptr[0]; + *outptr1++ = ptr[1]; + *outptr2++ = ptr[2]; + *outptr3++ = ptr[3]; + + ptr += 4; + } + } + } +#endif // __loongarch_sx + } + + if (dims == 3 || dims == 4) + { +#if __loongarch_sx + if (elempack == 4) // out_elempack == 4 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + float* outptr0 = (float*)top_blob + size * q * 4; + float* outptr1 = (float*)top_blob + size * (q * 4 + 1); + float* outptr2 = (float*)top_blob + size * (q * 4 + 2); + float* outptr3 = (float*)top_blob + size * (q * 4 + 3); + + int i = 0; + for (; i + 3 < size; i += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(ptr, 0); + __m128i _r1 = 
__lsx_vld(ptr + 4, 0); + __m128i _r2 = __lsx_vld(ptr + 4 * 2, 0); + __m128i _r3 = __lsx_vld(ptr + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr0, 0); + __lsx_vst(_r0123_1, outptr1, 0); + __lsx_vst(_r0123_2, outptr2, 0); + __lsx_vst(_r0123_3, outptr3, 0); + + ptr += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } + for (; i < size; i++) + { + *outptr0++ = ptr[0]; + *outptr1++ = ptr[1]; + *outptr2++ = ptr[2]; + *outptr3++ = ptr[3]; + + ptr += 4; + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) // out_elempack == 4 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + float* outptr = (float*)top_blob + size * q; + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __lsx_vst(__lsx_vld(ptr, 0), outptr, 0); + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr++ = *ptr++; + } + } + } + } + + return 0; +} + +int Flatten_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + + if (dims == 1) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + int size = w * h * d; + + int total = size * channels * elempack; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = total % 8 == 0 ? 
8 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (out_elempack == 1) + { + return Flatten::forward(bottom_blob, top_blob, opt); + } + + if (dims == 2 && elempack == 1) // out_elempack == 8 + { + top_blob = bottom_blob; + top_blob.dims = 1; + top_blob.w = total / out_elempack; + top_blob.h = 1; + top_blob.cstep = top_blob.w; + top_blob.elemsize = out_elemsize; + top_blob.elempack = out_elempack; + return 0; + } + + top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (dims == 2) + { +#if __loongarch_sx + if (elempack == 8) // out_elempack == 8 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const signed char* ptr = bottom_blob.row(i); + signed char* outptr0 = (signed char*)top_blob + w * i * 8; + signed char* outptr1 = (signed char*)top_blob + w * (i * 8 + 1); + signed char* outptr2 = (signed char*)top_blob + w * (i * 8 + 2); + signed char* outptr3 = (signed char*)top_blob + w * (i * 8 + 3); + signed char* outptr4 = (signed char*)top_blob + w * (i * 8 + 4); + signed char* outptr5 = (signed char*)top_blob + w * (i * 8 + 5); + signed char* outptr6 = (signed char*)top_blob + w * (i * 8 + 6); + signed char* outptr7 = (signed char*)top_blob + w * (i * 8 + 7); + + int j = 0; + for (; j < w; j++) + { + *outptr0++ = ptr[0]; + *outptr1++ = ptr[1]; + *outptr2++ = ptr[2]; + *outptr3++ = ptr[3]; + *outptr4++ = ptr[4]; + *outptr5++ = ptr[5]; + *outptr6++ = ptr[6]; + *outptr7++ = ptr[7]; + + ptr += 8; + } + } + } +#endif // __loongarch_sx + } + + if (dims == 3 || dims == 4) + { +#if __loongarch_sx + if (elempack == 8) // out_elempack == 8 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const signed char* ptr = bottom_blob.channel(q); + signed char* outptr0 = (signed char*)top_blob + size * q * 8; + signed char* outptr1 = (signed char*)top_blob + size * (q * 8 + 1); + signed char* outptr2 = (signed char*)top_blob + size * (q * 8 + 2); + signed char* outptr3 = (signed char*)top_blob + size * (q * 8 + 3); + signed char* outptr4 = (signed char*)top_blob + size * (q * 8 + 4); + signed char* outptr5 = (signed char*)top_blob + size * (q * 8 + 5); + signed char* outptr6 = (signed char*)top_blob + size * (q * 8 + 6); + signed char* outptr7 = (signed char*)top_blob + size * (q * 8 + 7); + + int i = 0; + for (; i < size; i++) + { + *outptr0++ = ptr[0]; + *outptr1++ = ptr[1]; + *outptr2++ = ptr[2]; + *outptr3++ = ptr[3]; + *outptr4++ = ptr[4]; + *outptr5++ = ptr[5]; + *outptr6++ = ptr[6]; + *outptr7++ = ptr[7]; + + ptr += 8; + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) // out_elempack == 8 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const signed char* ptr = bottom_blob.channel(q); + signed char* outptr = (signed char*)top_blob + size * q; + + int i = 0; + for (; i < size; i++) + { + *outptr++ = *ptr++; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/flatten_loongarch.h b/src/layer/loongarch/flatten_loongarch.h new file mode 100644 index 000000000000..afd35c701f59 --- /dev/null +++ b/src/layer/loongarch/flatten_loongarch.h @@ -0,0 +1,35 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. 
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_FLATTEN_LOONGARCH_H +#define LAYER_FLATTEN_LOONGARCH_H + +#include "flatten.h" + +namespace ncnn { + +class Flatten_loongarch : virtual public Flatten +{ +public: + Flatten_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_FLATTEN_LOONGARCH_H diff --git a/src/layer/loongarch/hardsigmoid_loongarch.cpp b/src/layer/loongarch/hardsigmoid_loongarch.cpp new file mode 100644 index 000000000000..9dfedb689bc5 --- /dev/null +++ b/src/layer/loongarch/hardsigmoid_loongarch.cpp @@ -0,0 +1,79 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
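
Flatten_loongarch only takes the packed path when the flattened length is a multiple of the pack size (4 for fp32, 8 for int8); otherwise it falls back to the reference Flatten::forward. For an elempack-4 input, routing each lane to its own output row is a 4x4 float transpose built from the vilvl/vilvh word and doubleword interleaves. In isolation, the shuffle sequence used above looks like this (same -mlsx and include assumptions as the earlier sketches; e0..e3 are four consecutive packed elements and outk receives lane k of each):

// Sketch of the 4x4 transpose step from the flatten kernels above.
static void transpose4x4(const float* ptr, float* out0, float* out1, float* out2, float* out3)
{
    __m128i _e0 = __lsx_vld(ptr, 0);      // e0: lanes 0..3
    __m128i _e1 = __lsx_vld(ptr + 4, 0);  // e1
    __m128i _e2 = __lsx_vld(ptr + 8, 0);  // e2
    __m128i _e3 = __lsx_vld(ptr + 12, 0); // e3

    __m128i _r01l = __lsx_vilvl_w(_e1, _e0); // e0[0] e1[0] e0[1] e1[1]
    __m128i _r01h = __lsx_vilvh_w(_e1, _e0); // e0[2] e1[2] e0[3] e1[3]
    __m128i _r23l = __lsx_vilvl_w(_e3, _e2); // e2[0] e3[0] e2[1] e3[1]
    __m128i _r23h = __lsx_vilvh_w(_e3, _e2); // e2[2] e3[2] e2[3] e3[3]

    __lsx_vst(__lsx_vilvl_d(_r23l, _r01l), out0, 0); // lane 0 of e0..e3
    __lsx_vst(__lsx_vilvh_d(_r23l, _r01l), out1, 0); // lane 1
    __lsx_vst(__lsx_vilvl_d(_r23h, _r01h), out2, 0); // lane 2
    __lsx_vst(__lsx_vilvh_d(_r23h, _r01h), out3, 0); // lane 3
}
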
+ +#include "hardsigmoid_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +HardSigmoid_loongarch::HardSigmoid_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int HardSigmoid_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(alpha); + __m128 _beta = (__m128)__lsx_vreplfr2vr_s(beta); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmadd_s(_alpha, _p, _beta); + _p = __lsx_vfmax_s(_p, _zero); + _p = __lsx_vfmin_s(_p, _one); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < lower) + *ptr = 0.f; + else if (*ptr > upper) + *ptr = 1.f; + else + *ptr = *ptr * alpha + beta; + ++ptr; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/hardsigmoid_loongarch.h b/src/layer/loongarch/hardsigmoid_loongarch.h new file mode 100644 index 000000000000..755ae89ff03e --- /dev/null +++ b/src/layer/loongarch/hardsigmoid_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_HARDSIGMOID_LOONGARCH_H +#define LAYER_HARDSIGMOID_LOONGARCH_H + +#include "hardsigmoid.h" + +namespace ncnn { + +class HardSigmoid_loongarch : virtual public HardSigmoid +{ +public: + HardSigmoid_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_HARDSIGMOID_LOONGARCH_H diff --git a/src/layer/loongarch/hardswish_loongarch.cpp b/src/layer/loongarch/hardswish_loongarch.cpp new file mode 100644 index 000000000000..f1417a7986c9 --- /dev/null +++ b/src/layer/loongarch/hardswish_loongarch.cpp @@ -0,0 +1,80 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "hardswish_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +HardSwish_loongarch::HardSwish_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int HardSwish_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(alpha); + __m128 _beta = (__m128)__lsx_vreplfr2vr_s(beta); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = __lsx_vfmadd_s(_alpha, _p, _beta); + _outp = __lsx_vfmax_s(_outp, _zero); + _outp = __lsx_vfmin_s(_outp, _one); + _outp = __lsx_vfmul_s(_outp, _p); + __lsx_vst(_outp, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < lower) + *ptr = 0.f; + else if (*ptr > upper) + ; + else + *ptr = *ptr * (*ptr * alpha + beta); + ++ptr; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/hardswish_loongarch.h b/src/layer/loongarch/hardswish_loongarch.h new file mode 100644 index 000000000000..e9b0821245c3 --- /dev/null +++ b/src/layer/loongarch/hardswish_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_HARDSWISH_LOONGARCH_H +#define LAYER_HARDSWISH_LOONGARCH_H + +#include "hardswish.h" + +namespace ncnn { + +class HardSwish_loongarch : virtual public HardSwish +{ +public: + HardSwish_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_HARDSWISH_LOONGARCH_H diff --git a/src/layer/loongarch/innerproduct_loongarch.cpp b/src/layer/loongarch/innerproduct_loongarch.cpp new file mode 100644 index 000000000000..3dd6ff35e232 --- /dev/null +++ b/src/layer/loongarch/innerproduct_loongarch.cpp @@ -0,0 +1,1637 @@ +// yala is pleased to support the open source community by making ncnn available. 
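
HardSigmoid_loongarch and HardSwish_loongarch take the same approach: the vector body evaluates alpha * x + beta with __lsx_vfmadd_s and clamps it to [0, 1] via __lsx_vfmax_s/__lsx_vfmin_s (HardSwish then multiplies the clamped value back by x), while the scalar tail branches on the lower/upper thresholds kept by the base layers. Both forms compute the same piecewise function. A scalar reference for comparison, assuming, as in the stock ncnn layers, that lower = -beta / alpha and upper = (1 - beta) / alpha:

// Sketch only: reference semantics of the two branches above.
static inline float hardsigmoid_ref(float x, float alpha, float beta)
{
    float y = x * alpha + beta;
    if (y < 0.f) y = 0.f; // x < lower  <=>  alpha * x + beta < 0 (for alpha > 0)
    if (y > 1.f) y = 1.f; // x > upper  <=>  alpha * x + beta > 1 (for alpha > 0)
    return y;
}

static inline float hardswish_ref(float x, float alpha, float beta)
{
    return x * hardsigmoid_ref(x, alpha, beta);
}
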
+// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "innerproduct_loongarch.h" + +#include "layer_type.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +#include "loongarch_activation.h" + +namespace ncnn { + +InnerProduct_loongarch::InnerProduct_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx + + flatten = 0; +} + +int InnerProduct_loongarch::create_pipeline(const Option& opt) +{ + { + flatten = ncnn::create_layer(ncnn::LayerType::Flatten); + + ncnn::ParamDict pd; + + flatten->load_param(pd); + + flatten->create_pipeline(opt); + } + +#if NCNN_INT8 + if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + { + return create_pipeline_int8_loongarch(opt); + } +#endif + +#if __loongarch_sx + if (opt.use_fp16_storage) + { + return create_pipeline_fp16s(opt); + } +#endif + + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; + +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __loongarch_sx + + if (out_elempack == 4) + { + // src = inch-outch + // dst = 4-inch-outch/4 + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / 4, (size_t)4u * 4, 4); + + for (int q = 0; q + 3 < num_output; q += 4) + { + float* g0 = weight_data_tm.row(q / 4); + + for (int p = 0; p < num_input; p++) + { + for (int j = 0; j < 4; j++) + { + *g0++ = weight_data_r2.row(q + j)[p]; + } + } + } + } + } + else + { + weight_data_tm = weight_data; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int InnerProduct_loongarch::destroy_pipeline(const Option& opt) +{ + if (flatten) + { + flatten->destroy_pipeline(opt); + delete flatten; + flatten = 0; + } + + return 0; +} + +int InnerProduct_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + return forward_int8_loongarch(bottom_blob, top_blob, opt); + } +#endif + +#if __loongarch_sx + if (opt.use_fp16_storage) + { + return forward_fp16s(bottom_blob, top_blob, opt); + } +#endif + + const int num_input = weight_data_size / num_output; + + if (bottom_blob.dims == 2 && bottom_blob.w == num_input && bottom_blob.h * bottom_blob.elempack > 1) + { + // gemm + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + num_output_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { +#if __loongarch_sx + if (elempack == 4 && num_output_elempack == 4) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const float* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 0]); + _sum1 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 1]); + _sum2 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 2]); + _sum3 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 3]); + } + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 16); + __m128 _val = (__m128)__lsx_vld(m, 0); + __m128i _w = __lsx_vld(kptr, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 3), _val, _sum3); + + m += 4; + kptr += 4; + } + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + _sum1 = activation_ps(_sum1, activation_type, activation_params); + _sum2 = activation_ps(_sum2, activation_type, activation_params); + _sum3 = activation_ps(_sum3, activation_type, activation_params); + + __lsx_vst(_sum0, outptr, 0); + __lsx_vst(_sum1, outptr + 4, 0); + __lsx_vst(_sum2, outptr + 8, 0); + __lsx_vst(_sum3, outptr + 12, 0); + outptr += 16; + } + } + + if (elempack == 1 && num_output_elempack == 4) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const float* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + int i = 0; + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(m, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3); + + m += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = __lsx_vreplfr2vr_s(m[0]); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum0 = __lsx_vfmadd_s(_w, _val, _sum0); + + m += 1; + kptr += 4; + } + + _sum0 = __lsx_vfadd_s(_sum0, _sum1); + _sum2 = __lsx_vfadd_s(_sum2, _sum3); + _sum0 = __lsx_vfadd_s(_sum0, _sum2); + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + + __lsx_vst(_sum0, outptr, 0); + outptr += 4; + } + } + + if (elempack == 4 && num_output_elempack == 1) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const float* kptr = (const float*)weight_data_tm + 
num_input * p; + const float* m = bottom_blob.row(j); + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = __lsx_vreplfr2vr_s(bias_data[p]); + } + + for (int i = 0; i < num_input; i++) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 4); + __m128 _val = (__m128)__lsx_vld(m, 0); + __m128 _k = __lsx_vreplfr2vr_s(kptr[0]); + _sum = __lsx_vfmadd_s(_k, _val, _sum); + + m += 4; + kptr += 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr, 0); + outptr += 4; + } + } +#endif // __loongarch_sx + + if (elempack == 1 && num_output_elempack == 1) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const float* kptr = (const float*)weight_data_tm + num_input * p; + const float* m = bottom_blob.row(j); + + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + int i = 0; +#if __loongarch_sx + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _m, _sum); + + m += 4; + kptr += 4; + } + sum += __lsx_reduce_fadd_s(_sum); +#endif // __loongarch_sx + for (; i < num_input; i++) + { + sum += *m * *kptr; + + m += 1; + kptr += 1; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[0] = sum; + outptr += 1; + } + } + } + + return 0; + } + + // flatten + Mat bottom_blob_flattened = bottom_blob; + if (bottom_blob.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + + flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + } + + size_t elemsize = bottom_blob_flattened.elemsize; + int elempack = bottom_blob_flattened.elempack; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif // __loongarch_sx + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + const float* kptr = weight_data_tm.row(p); + + const float* sptr = bottom_blob_flattened; + + int i = 0; + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(sptr + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(sptr, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3); + + sptr += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = __lsx_vreplfr2vr_s(sptr[0]); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum0 = __lsx_vfmadd_s(_w, _val, _sum0); + + sptr += 1; + kptr += 4; + } + + _sum0 = __lsx_vfadd_s(_sum0, _sum1); + _sum2 = __lsx_vfadd_s(_sum2, _sum3); + _sum0 = __lsx_vfadd_s(_sum0, _sum2); + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + + float* outptr = top_blob; + __lsx_vst(_sum0, outptr + p * 4, 0); + } + } +#endif // __loongarch_sx + + if (out_elempack == 1) + { + int nn_num_output = num_output / 4; + int remain_num_output_start = nn_num_output * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * 4; + + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + + if (bias_term) + { + sum0 = bias_data[p]; + sum1 = bias_data[p + 1]; + sum2 = bias_data[p + 2]; + sum3 = bias_data[p + 3]; + } + + const float* w0 = (const float*)weight_data_tm + num_input * p; + const float* w1 = (const float*)weight_data_tm + num_input * (p + 1); + const float* w2 = (const float*)weight_data_tm + num_input * (p + 2); + const float* w3 = (const float*)weight_data_tm + num_input * (p + 3); + + const float* m = bottom_blob_flattened; + + int i = 0; +#if __loongarch_sx + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(w0 + 16); + __builtin_prefetch(w1 + 16); + __builtin_prefetch(w2 + 16); + __builtin_prefetch(w3 + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w0 = (__m128)__lsx_vld(w0, 0); + __m128 _w1 = (__m128)__lsx_vld(w1, 0); + __m128 _w2 = (__m128)__lsx_vld(w2, 0); + __m128 _w3 = (__m128)__lsx_vld(w3, 0); + _sum0 = __lsx_vfmadd_s(_w0, _m, _sum0); + _sum1 = __lsx_vfmadd_s(_w1, _m, _sum1); + _sum2 = __lsx_vfmadd_s(_w2, _m, _sum2); + _sum3 = __lsx_vfmadd_s(_w3, _m, _sum3); + + m += 4; + w0 += 4; + w1 += 4; + w2 += 4; + 
w3 += 4; + } +#endif // __loongarch_sx + for (; i < num_input; i++) + { + sum0 += *m * *w0; + sum1 += *m * *w1; + sum2 += *m * *w2; + sum3 += *m * *w3; + + m++; + w0++; + w1++; + w2++; + w3++; + } + +#if __loongarch_sx + sum0 += __lsx_reduce_fadd_s(_sum0); + sum1 += __lsx_reduce_fadd_s(_sum1); + sum2 += __lsx_reduce_fadd_s(_sum2); + sum3 += __lsx_reduce_fadd_s(_sum3); +#endif // __loongarch_sx + + sum0 = activation_ss(sum0, activation_type, activation_params); + sum1 = activation_ss(sum1, activation_type, activation_params); + sum2 = activation_ss(sum2, activation_type, activation_params); + sum3 = activation_ss(sum3, activation_type, activation_params); + + top_blob[p] = sum0; + top_blob[p + 1] = sum1; + top_blob[p + 2] = sum2; + top_blob[p + 3] = sum3; + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_num_output_start; p < num_output; p++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[p]; + + const float* w = (const float*)weight_data_tm + num_input * p; + + const float* m = bottom_blob_flattened; + + int i = 0; +#if __loongarch_sx + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(w + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w = (__m128)__lsx_vld(w, 0); + _sum0 = __lsx_vfmadd_s(_w, _m, _sum0); + + m += 4; + w += 4; + } + sum += __lsx_reduce_fadd_s(_sum0); +#endif // __loongarch_sx + for (; i < num_input; i++) + { + sum += *m * *w; + + m++; + w++; + } + + sum = activation_ss(sum, activation_type, activation_params); + + top_blob[p] = sum; + } + } + + return 0; +} + +#if __loongarch_sx +int InnerProduct_loongarch::create_pipeline_fp16s(const Option& opt) +{ + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } + + // src = inch-outch + // dst = pb-inch-outch/pb + if (out_elempack == 4) + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / 4, (size_t)8u, 4); + + for (int q = 0; q + 3 < num_output; q += 4) + { + unsigned short* g0 = weight_data_tm.row(q / 4); + + const float* k0 = weight_data_r2.row(q); + const float* k1 = weight_data_r2.row(q + 1); + const float* k2 = weight_data_r2.row(q + 2); + const float* k3 = weight_data_r2.row(q + 3); + + int p = 0; + for (; p + 3 < num_input; p += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(k0, 0); + __m128i _r1 = __lsx_vld(k1, 0); + __m128i _r2 = __lsx_vld(k2, 0); + __m128i _r3 = __lsx_vld(k3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __m128i _p0 = __lsx_vfcvt_h_s((__m128)_r0123_1, (__m128)_r0123_0); + __m128i _p1 = __lsx_vfcvt_h_s((__m128)_r0123_3, (__m128)_r0123_2); + + __lsx_vst(_p0, g0, 0); + __lsx_vst(_p1, g0 + 8, 0); + + k0 += 4; + k1 += 4; + k2 += 4; + k3 += 4; + g0 += 16; + } + for (; p < num_input; p++) + { + g0[0] = float32_to_float16(*k0++); + g0[1] = float32_to_float16(*k1++); + g0[2] = float32_to_float16(*k2++); + g0[3] = float32_to_float16(*k3++); + g0 += 4; + } + } + } + + if (out_elempack == 1) + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt); + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int InnerProduct_loongarch::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int num_input = weight_data_size / num_output; + + if (bottom_blob.dims == 2 && bottom_blob.w == num_input && bottom_blob.h * bottom_blob.elempack > 1) + { + // gemm + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; + if (opt.use_packing_layout) + { + num_output_elempack = num_output % 4 == 0 ? 
4 : 1; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { + if (elempack == 4 && num_output_elempack == 4) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const unsigned short* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 0]); + _sum1 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 1]); + _sum2 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 2]); + _sum3 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 3]); + } + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 16); + __m128 _val = (__m128)__lsx_vld(m, 0); + __m128i _w = (__m128i)__lsx_vfcvtl_s_h(__lsx_vld(kptr, 0)); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 3), _val, _sum3); + + m += 4; + kptr += 4; + } + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + _sum1 = activation_ps(_sum1, activation_type, activation_params); + _sum2 = activation_ps(_sum2, activation_type, activation_params); + _sum3 = activation_ps(_sum3, activation_type, activation_params); + + __lsx_vst(_sum0, outptr, 0); + __lsx_vst(_sum1, outptr + 4, 0); + __lsx_vst(_sum2, outptr + 8, 0); + __lsx_vst(_sum3, outptr + 12, 0); + outptr += 16; + } + } + + if (elempack == 1 && num_output_elempack == 4) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const unsigned short* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + int i = 0; + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(m, 0); + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 8, 0); + __m128 _w0 = __lsx_vfcvtl_s_h(_w01); + __m128 _w1 = __lsx_vfcvth_s_h(_w01); + __m128 _w2 = __lsx_vfcvtl_s_h(_w23); + __m128 _w3 = __lsx_vfcvth_s_h(_w23); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3); + + m += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = __lsx_vreplfr2vr_s(m[0]); + __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(kptr, 0)); + _sum0 = __lsx_vfmadd_s(_w, _val, _sum0); + + m += 1; + kptr += 4; + } + + _sum0 = __lsx_vfadd_s(_sum0, _sum1); + _sum2 = __lsx_vfadd_s(_sum2, _sum3); + _sum0 = __lsx_vfadd_s(_sum0, _sum2); + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + + __lsx_vst(_sum0, outptr, 0); + outptr += 4; + } + } + + if (elempack == 4 && num_output_elempack == 1) + { + float* outptr = 
top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const unsigned short* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = __lsx_vreplfr2vr_s(bias_data[p]); + } + + for (int i = 0; i < num_input; i++) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 4); + __m128 _val = (__m128)__lsx_vld(m, 0); + __m128 _k = __lsx_vreplfr2vr_s(float16_to_float32(kptr[0])); + _sum = __lsx_vfmadd_s(_k, _val, _sum); + + m += 4; + kptr += 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr, 0); + outptr += 4; + } + } + + if (elempack == 1 && num_output_elempack == 1) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const unsigned short* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + int i = 0; + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(kptr, 0)); + _sum = __lsx_vfmadd_s(_w, _m, _sum); + + m += 4; + kptr += 4; + } + sum += __lsx_reduce_fadd_s(_sum); + for (; i < num_input; i++) + { + sum += *m * float16_to_float32(*kptr); + + m += 1; + kptr += 1; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[0] = sum; + outptr += 1; + } + } + } + + return 0; + } + + // flatten + Mat bottom_blob_flattened = bottom_blob; + if (bottom_blob.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + + flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + } + + size_t elemsize = bottom_blob_flattened.elemsize; + int elempack = bottom_blob_flattened.elempack; + + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + const unsigned short* kptr = weight_data_tm.row(p); + + const float* sptr = bottom_blob_flattened; + + int i = 0; + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(sptr + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(sptr, 0); + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 8, 0); + __m128 _w0 = __lsx_vfcvtl_s_h(_w01); + __m128 _w1 = __lsx_vfcvth_s_h(_w01); + __m128 _w2 = __lsx_vfcvtl_s_h(_w23); + __m128 _w3 = __lsx_vfcvth_s_h(_w23); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3); + + sptr += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = __lsx_vreplfr2vr_s(sptr[0]); + __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(kptr, 0)); + _sum0 = __lsx_vfmadd_s(_w, _val, _sum0); + + sptr += 1; + kptr += 4; + } + + _sum0 = __lsx_vfadd_s(_sum0, _sum1); + _sum2 = __lsx_vfadd_s(_sum2, _sum3); + _sum0 = __lsx_vfadd_s(_sum0, _sum2); + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + + float* outptr = top_blob; + __lsx_vst(_sum0, outptr + p * 4, 0); + } + } + + if (out_elempack == 1) + { + int nn_num_output = num_output / 4; + int remain_num_output_start = nn_num_output * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * 4; + + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + + if (bias_term) + { + sum0 = bias_data[p]; + sum1 = bias_data[p + 1]; + sum2 = bias_data[p + 2]; + sum3 = bias_data[p + 3]; + } + + const unsigned short* w0 = weight_data_tm.row(p); + const unsigned short* w1 = weight_data_tm.row(p + 1); + const unsigned short* w2 = weight_data_tm.row(p + 2); + const unsigned short* w3 = weight_data_tm.row(p + 3); + + const float* m = bottom_blob_flattened; + + int i = 0; + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(w0 + 16); + __builtin_prefetch(w1 + 16); + __builtin_prefetch(w2 + 16); + __builtin_prefetch(w3 + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w0 = __lsx_vfcvtl_s_h(__lsx_vld(w0, 0)); + __m128 _w1 = __lsx_vfcvtl_s_h(__lsx_vld(w1, 0)); + __m128 _w2 = __lsx_vfcvtl_s_h(__lsx_vld(w2, 0)); + __m128 _w3 = __lsx_vfcvtl_s_h(__lsx_vld(w3, 0)); + _sum0 = __lsx_vfmadd_s(_w0, _m, _sum0); + _sum1 = __lsx_vfmadd_s(_w1, _m, _sum1); + _sum2 = __lsx_vfmadd_s(_w2, _m, _sum2); + _sum3 = __lsx_vfmadd_s(_w3, _m, _sum3); + + m += 4; + w0 += 4; + w1 += 4; + w2 += 4; + w3 += 4; + } + for (; i < num_input; i++) + { 
+ sum0 += *m * float16_to_float32(*w0); + sum1 += *m * float16_to_float32(*w1); + sum2 += *m * float16_to_float32(*w2); + sum3 += *m * float16_to_float32(*w3); + + m++; + w0++; + w1++; + w2++; + w3++; + } + + sum0 += __lsx_reduce_fadd_s(_sum0); + sum1 += __lsx_reduce_fadd_s(_sum1); + sum2 += __lsx_reduce_fadd_s(_sum2); + sum3 += __lsx_reduce_fadd_s(_sum3); + + sum0 = activation_ss(sum0, activation_type, activation_params); + sum1 = activation_ss(sum1, activation_type, activation_params); + sum2 = activation_ss(sum2, activation_type, activation_params); + sum3 = activation_ss(sum3, activation_type, activation_params); + + top_blob[p] = sum0; + top_blob[p + 1] = sum1; + top_blob[p + 2] = sum2; + top_blob[p + 3] = sum3; + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_num_output_start; p < num_output; p++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[p]; + + const unsigned short* w = weight_data_tm.row(p); + + const float* m = bottom_blob_flattened; + + int i = 0; + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(w + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(w, 0)); + _sum0 = __lsx_vfmadd_s(_w, _m, _sum0); + + m += 4; + w += 4; + } + sum += __lsx_reduce_fadd_s(_sum0); + for (; i < num_input; i++) + { + sum += *m * float16_to_float32(*w); + + m++; + w++; + } + + sum = activation_ss(sum, activation_type, activation_params); + + top_blob[p] = sum; + } + } + + return 0; +} +#endif // __loongarch_sx + +#if NCNN_INT8 +int InnerProduct_loongarch::create_pipeline_int8_loongarch(const Option& opt) +{ + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 
8 : 1; + } +#endif // __loongarch_sx + + // src = inch-outch + // dst = pb-inch-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / out_elempack, (size_t)out_elempack, out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + signed char* g0 = weight_data_tm.row(q / out_elempack); + + for (int p = 0; p < num_input; p++) + { + for (int j = 0; j < out_elempack; j++) + { + *g0++ = weight_data_r2.row(q + j)[p]; + } + } + } + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int InnerProduct_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int num_input = weight_data_size / num_output; + + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + if (bottom_blob_int8.dims == 2 && bottom_blob_int8.w == num_input && bottom_blob_int8.h * bottom_blob_int8.elempack > 1) + { + // gemm + Mat bottom_blob_int8_unpacked; + Option opt_unpack = opt; + opt_unpack.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_int8, bottom_blob_int8_unpacked, 1, opt_unpack); + + int h = bottom_blob_int8_unpacked.h; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = h % 4 == 0 ? 4 : 1; + } +#endif + + int outh = h / out_elempack; + + top_blob.create(num_output, outh, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + num_output_elempack = num_output % 8 == 0 ? 
8 : 1; + } +#endif + +#if __loongarch_sx + if (num_output_elempack == 8 && out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m0 = bottom_blob_int8_unpacked.row(j * 4); + const signed char* m1 = bottom_blob_int8_unpacked.row(j * 4 + 1); + const signed char* m2 = bottom_blob_int8_unpacked.row(j * 4 + 2); + const signed char* m3 = bottom_blob_int8_unpacked.row(j * 4 + 3); + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum20 = __lsx_vreplgr2vr_w(0); + __m128i _sum21 = __lsx_vreplgr2vr_w(0); + __m128i _sum30 = __lsx_vreplgr2vr_w(0); + __m128i _sum31 = __lsx_vreplgr2vr_w(0); + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(m0 + 4); + __builtin_prefetch(m1 + 4); + __builtin_prefetch(m2 + 4); + __builtin_prefetch(m3 + 4); + __builtin_prefetch(kptr + 32); + __m128i _val0 = __lsx_vreplgr2vr_h((short)m0[0]); + __m128i _val1 = __lsx_vreplgr2vr_h((short)m1[0]); + __m128i _val2 = __lsx_vreplgr2vr_h((short)m2[0]); + __m128i _val3 = __lsx_vreplgr2vr_h((short)m3[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val0, _w16); + __m128i _s1 = __lsx_vmul_h(_val1, _w16); + __m128i _s2 = __lsx_vmul_h(_val2, _w16); + __m128i _s3 = __lsx_vmul_h(_val3, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _exts1 = __lsx_vslti_h(_s1, 0); + __m128i _exts2 = __lsx_vslti_h(_s2, 0); + __m128i _exts3 = __lsx_vslti_h(_s3, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + __m128i _s1l = __lsx_vilvl_h(_exts1, _s1); + __m128i _s1h = __lsx_vilvh_h(_exts1, _s1); + __m128i _s2l = __lsx_vilvl_h(_exts2, _s2); + __m128i _s2h = __lsx_vilvh_h(_exts2, _s2); + __m128i _s3l = __lsx_vilvl_h(_exts3, _s3); + __m128i _s3h = __lsx_vilvh_h(_exts3, _s3); + + _sum00 = __lsx_vadd_w(_sum00, _s0l); + _sum01 = __lsx_vadd_w(_sum01, _s0h); + _sum10 = __lsx_vadd_w(_sum10, _s1l); + _sum11 = __lsx_vadd_w(_sum11, _s1h); + _sum20 = __lsx_vadd_w(_sum20, _s2l); + _sum21 = __lsx_vadd_w(_sum21, _s2h); + _sum30 = __lsx_vadd_w(_sum30, _s3l); + _sum31 = __lsx_vadd_w(_sum31, _s3h); + + m0++; + m1++; + m2++; + m3++; + kptr += 8; + } + + // dequantize and relu + __m128 _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8, 0); + __m128 _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8 + 4, 0); + + __m128 _sumfp32_00 = __lsx_vffint_s_w(_sum00); + __m128 _sumfp32_01 = __lsx_vffint_s_w(_sum01); + __m128 _sumfp32_10 = __lsx_vffint_s_w(_sum10); + __m128 _sumfp32_11 = __lsx_vffint_s_w(_sum11); + __m128 _sumfp32_20 = __lsx_vffint_s_w(_sum20); + __m128 _sumfp32_21 = __lsx_vffint_s_w(_sum21); + __m128 _sumfp32_30 = __lsx_vffint_s_w(_sum30); + __m128 _sumfp32_31 = __lsx_vffint_s_w(_sum31); + if (bias_term) + { + __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + p * 8, 0); + __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + p * 8 + 4, 0); + _sumfp32_00 = __lsx_vfmadd_s(_scale_in0, _sumfp32_00, _bias0); + _sumfp32_01 = __lsx_vfmadd_s(_scale_in1, _sumfp32_01, _bias1); + _sumfp32_10 = __lsx_vfmadd_s(_scale_in0, _sumfp32_10, _bias0); + _sumfp32_11 = __lsx_vfmadd_s(_scale_in1, _sumfp32_11, _bias1); + _sumfp32_20 = 
__lsx_vfmadd_s(_scale_in0, _sumfp32_20, _bias0); + _sumfp32_21 = __lsx_vfmadd_s(_scale_in1, _sumfp32_21, _bias1); + _sumfp32_30 = __lsx_vfmadd_s(_scale_in0, _sumfp32_30, _bias0); + _sumfp32_31 = __lsx_vfmadd_s(_scale_in1, _sumfp32_31, _bias1); + } + else + { + _sumfp32_00 = __lsx_vfmul_s(_sumfp32_00, _scale_in0); + _sumfp32_01 = __lsx_vfmul_s(_sumfp32_01, _scale_in1); + _sumfp32_10 = __lsx_vfmul_s(_sumfp32_10, _scale_in0); + _sumfp32_11 = __lsx_vfmul_s(_sumfp32_11, _scale_in1); + _sumfp32_20 = __lsx_vfmul_s(_sumfp32_20, _scale_in0); + _sumfp32_21 = __lsx_vfmul_s(_sumfp32_21, _scale_in1); + _sumfp32_30 = __lsx_vfmul_s(_sumfp32_30, _scale_in0); + _sumfp32_31 = __lsx_vfmul_s(_sumfp32_31, _scale_in1); + } + + _sumfp32_00 = activation_ps(_sumfp32_00, activation_type, activation_params); + _sumfp32_01 = activation_ps(_sumfp32_01, activation_type, activation_params); + _sumfp32_10 = activation_ps(_sumfp32_10, activation_type, activation_params); + _sumfp32_11 = activation_ps(_sumfp32_11, activation_type, activation_params); + _sumfp32_20 = activation_ps(_sumfp32_20, activation_type, activation_params); + _sumfp32_21 = activation_ps(_sumfp32_21, activation_type, activation_params); + _sumfp32_30 = activation_ps(_sumfp32_30, activation_type, activation_params); + _sumfp32_31 = activation_ps(_sumfp32_31, activation_type, activation_params); + + // transpose 4x8 + __m128i _r01r = __lsx_vilvl_w((__m128i)_sumfp32_10, (__m128i)_sumfp32_00); + __m128i _r01l = __lsx_vilvh_w((__m128i)_sumfp32_10, (__m128i)_sumfp32_00); + __m128i _r23r = __lsx_vilvl_w((__m128i)_sumfp32_30, (__m128i)_sumfp32_20); + __m128i _r23l = __lsx_vilvh_w((__m128i)_sumfp32_30, (__m128i)_sumfp32_20); + __m128i _r45r = __lsx_vilvl_w((__m128i)_sumfp32_11, (__m128i)_sumfp32_01); + __m128i _r45l = __lsx_vilvh_w((__m128i)_sumfp32_11, (__m128i)_sumfp32_01); + __m128i _r67r = __lsx_vilvl_w((__m128i)_sumfp32_31, (__m128i)_sumfp32_21); + __m128i _r67l = __lsx_vilvh_w((__m128i)_sumfp32_31, (__m128i)_sumfp32_21); + _sumfp32_00 = (__m128)__lsx_vilvl_d(_r23r, _r01r); + _sumfp32_10 = (__m128)__lsx_vilvh_d(_r23r, _r01r); + _sumfp32_20 = (__m128)__lsx_vilvl_d(_r23l, _r01l); + _sumfp32_30 = (__m128)__lsx_vilvh_d(_r23l, _r01l); + _sumfp32_01 = (__m128)__lsx_vilvl_d(_r67r, _r45r); + _sumfp32_11 = (__m128)__lsx_vilvh_d(_r67r, _r45r); + _sumfp32_21 = (__m128)__lsx_vilvl_d(_r67l, _r45l); + _sumfp32_31 = (__m128)__lsx_vilvh_d(_r67l, _r45l); + + __lsx_vst(_sumfp32_00, outptr, 0); + __lsx_vst(_sumfp32_10, outptr + 4, 0); + __lsx_vst(_sumfp32_20, outptr + 8, 0); + __lsx_vst(_sumfp32_30, outptr + 12, 0); + __lsx_vst(_sumfp32_01, outptr + 16, 0); + __lsx_vst(_sumfp32_11, outptr + 20, 0); + __lsx_vst(_sumfp32_21, outptr + 24, 0); + __lsx_vst(_sumfp32_31, outptr + 28, 0); + + outptr += 32; + } + } + } + + if (num_output_elempack == 1 && out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m0 = bottom_blob_int8_unpacked.row(j * 4); + const signed char* m1 = bottom_blob_int8_unpacked.row(j * 4 + 1); + const signed char* m2 = bottom_blob_int8_unpacked.row(j * 4 + 2); + const signed char* m3 = bottom_blob_int8_unpacked.row(j * 4 + 3); + + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + + int i = 0; + for (; i < num_input; i++) + { + sum0 += *m0++ * kptr[0]; + sum1 += *m1++ * kptr[0]; + sum2 += *m2++ * kptr[0]; + sum3 += *m3++ * kptr[0]; + kptr 
+= 1; + } + + // dequantize and relu + float sumfp32_0 = sum0 * scale_in_data[p]; + float sumfp32_1 = sum1 * scale_in_data[p]; + float sumfp32_2 = sum2 * scale_in_data[p]; + float sumfp32_3 = sum3 * scale_in_data[p]; + + if (bias_term) + { + sumfp32_0 += bias_data[p]; + sumfp32_1 += bias_data[p]; + sumfp32_2 += bias_data[p]; + sumfp32_3 += bias_data[p]; + } + + outptr[0] = activation_ss(sumfp32_0, activation_type, activation_params); + outptr[1] = activation_ss(sumfp32_1, activation_type, activation_params); + outptr[2] = activation_ss(sumfp32_2, activation_type, activation_params); + outptr[3] = activation_ss(sumfp32_3, activation_type, activation_params); + outptr += 4; + } + } + } + + if (num_output_elempack == 8 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(m + 4); + __builtin_prefetch(kptr + 32); + __m128i _val = __lsx_vreplgr2vr_h((short)m[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + + m++; + kptr += 8; + } + + // dequantize and relu + __m128 _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8, 0); + __m128 _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8 + 4, 0); + + __m128 _sumfp32_0 = __lsx_vffint_s_w(_sum0); + __m128 _sumfp32_1 = __lsx_vffint_s_w(_sum1); + + if (bias_term) + { + __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + p * 8, 0); + __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + p * 8 + 4, 0); + _sumfp32_0 = __lsx_vfmadd_s(_scale_in0, _sumfp32_0, _bias0); + _sumfp32_1 = __lsx_vfmadd_s(_scale_in1, _sumfp32_1, _bias1); + } + else + { + _sumfp32_0 = __lsx_vfmul_s(_sumfp32_0, _scale_in0); + _sumfp32_1 = __lsx_vfmul_s(_sumfp32_1, _scale_in1); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params); + + __lsx_vst(_sumfp32_0, outptr, 0); + __lsx_vst(_sumfp32_1, outptr + 4, 0); + outptr += 8; + } + } + } +#endif // __loongarch_sx + + if (num_output_elempack == 1 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + int sum = 0; + + int i = 0; + for (; i < num_input; i++) + { + sum += *m++ * *kptr++; + } + + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + + outptr[0] = activation_ss(sumfp32, activation_type, activation_params); + outptr += 1; + } + } + } + + return 0; + } + + Mat bottom_blob_int8_flattened = bottom_blob_int8; + if (bottom_blob_int8.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + 
flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten); + } + + // int elempack = bottom_blob_int8_flattened.elempack; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif // __loongarch_sx + // size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(sptr + 4); + __builtin_prefetch(kptr + 32); + __m128i _val = __lsx_vreplgr2vr_h((short)sptr[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + + sptr += 1; + kptr += 8; + } + + // dequantize and relu + __m128 _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8, 0); + __m128 _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8 + 4, 0); + + __m128 _sumfp32_0 = __lsx_vffint_s_w(_sum0); + __m128 _sumfp32_1 = __lsx_vffint_s_w(_sum1); + + if (bias_term) + { + __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + p * 8, 0); + __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + p * 8 + 4, 0); + _sumfp32_0 = __lsx_vfmadd_s(_scale_in0, _sumfp32_0, _bias0); + _sumfp32_1 = __lsx_vfmadd_s(_scale_in1, _sumfp32_1, _bias1); + } + else + { + _sumfp32_0 = __lsx_vfmul_s(_sumfp32_0, _scale_in0); + _sumfp32_1 = __lsx_vfmul_s(_sumfp32_1, _scale_in1); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params); + + float* outptr = (float*)top_blob + p * 8; + __lsx_vst(_sumfp32_0, outptr, 0); + __lsx_vst(_sumfp32_1, outptr + 4, 0); + } + } +#endif // __loongarch_sx + + if (out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + int sum = 0; + + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + int i = 0; + for (; i < num_input; i++) + { + signed char val = sptr[0]; + + signed char w = kptr[0]; + + sum += val * w; + + sptr += 1; + kptr += 1; + } + + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + top_blob[p] = sumfp32; + } + } + + return 0; +} +#endif // NCNN_INT8 + +} // namespace ncnn diff --git a/src/layer/loongarch/innerproduct_loongarch.h b/src/layer/loongarch/innerproduct_loongarch.h new file mode 100644 index 000000000000..4d9574ce9192 --- /dev/null +++ b/src/layer/loongarch/innerproduct_loongarch.h @@ -0,0 +1,54 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. 
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_INNERPRODUCT_LOONGARCH_H +#define LAYER_INNERPRODUCT_LOONGARCH_H + +#include "innerproduct.h" + +namespace ncnn { + +class InnerProduct_loongarch : virtual public InnerProduct +{ +public: + InnerProduct_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: +#if __loongarch_sx + int create_pipeline_fp16s(const Option& opt); + int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif +#if NCNN_INT8 + int create_pipeline_int8_loongarch(const Option& opt); + int forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + +public: + Layer* flatten; + + Mat weight_data_tm; + +#if NCNN_INT8 + Mat scale_in_data; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_INNERPRODUCT_LOONGARCH_H diff --git a/src/layer/loongarch/interp_bicubic.h b/src/layer/loongarch/interp_bicubic.h new file mode 100644 index 000000000000..e52ba81de4f0 --- /dev/null +++ b/src/layer/loongarch/interp_bicubic.h @@ -0,0 +1,261 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
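Note on the bicubic path that follows: interpolate_cubic() produces four Keys-style cubic weights (A = -0.75) for the taps at sx-1 .. sx+2, and the fourth weight is defined so that the four always sum to 1, which gives the filter linear precision. Below is a minimal standalone sketch of how those weights blend four source samples; the helper name cubic_weights and the sample values are illustrative only, not part of the patch.

#include <cstdio>

static void cubic_weights(float fx, float* w) // same A = -0.75 convention as interpolate_cubic()
{
    const float A = -0.75f;
    float fx0 = fx + 1, fx1 = fx, fx2 = 1 - fx;
    w[0] = A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A;
    w[1] = (A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1;
    w[2] = (A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1;
    w[3] = 1.f - w[0] - w[1] - w[2]; // the four weights always sum to 1
}

int main()
{
    // interpolate at fractional offset 0.25 past the second sample of a linear ramp
    float s[4] = {1.f, 2.f, 3.f, 4.f}; // taps at sx-1, sx, sx+1, sx+2
    float w[4];
    cubic_weights(0.25f, w);
    float v = s[0] * w[0] + s[1] * w[1] + s[2] * w[2] + s[3] * w[3];
    printf("%f\n", v); // ~2.25: a linear ramp is reproduced exactly
    return 0;
}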
+ +static inline void interpolate_cubic(float fx, float* coeffs) +{ + const float A = -0.75f; + + float fx0 = fx + 1; + float fx1 = fx; + float fx2 = 1 - fx; + // float fx3 = 2 - fx; + + coeffs[0] = A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A; + coeffs[1] = (A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1; + coeffs[2] = (A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} + +static void cubic_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner) +{ + double scale = (double)w / outw; + if (align_corner) + { + scale = (double)(w - 1) / (outw - 1); + } + + for (int dx = 0; dx < outw; dx++) + { + float fx = (float)((dx + 0.5) * scale - 0.5); + if (align_corner) + { + fx = (float)(dx * scale); + } + + int sx = static_cast(floor(fx)); + fx -= sx; + + interpolate_cubic(fx, alpha + dx * 4); + + if (sx <= -1) + { + sx = 1; + alpha[dx * 4 + 0] = 1.f - alpha[dx * 4 + 3]; + alpha[dx * 4 + 1] = alpha[dx * 4 + 3]; + alpha[dx * 4 + 2] = 0.f; + alpha[dx * 4 + 3] = 0.f; + } + if (sx == 0) + { + sx = 1; + alpha[dx * 4 + 0] = alpha[dx * 4 + 0] + alpha[dx * 4 + 1]; + alpha[dx * 4 + 1] = alpha[dx * 4 + 2]; + alpha[dx * 4 + 2] = alpha[dx * 4 + 3]; + alpha[dx * 4 + 3] = 0.f; + } + if (sx == w - 2) + { + sx = w - 3; + alpha[dx * 4 + 3] = alpha[dx * 4 + 2] + alpha[dx * 4 + 3]; + alpha[dx * 4 + 2] = alpha[dx * 4 + 1]; + alpha[dx * 4 + 1] = alpha[dx * 4 + 0]; + alpha[dx * 4 + 0] = 0.f; + } + if (sx >= w - 1) + { + sx = w - 3; + alpha[dx * 4 + 3] = 1.f - alpha[dx * 4 + 0]; + alpha[dx * 4 + 2] = alpha[dx * 4 + 0]; + alpha[dx * 4 + 1] = 0.f; + alpha[dx * 4 + 0] = 0.f; + } + + xofs[dx] = sx; + } +} + +static void resize_bicubic_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) +{ + int w = dst.w; + int h = dst.h; + + // loop body + Mat rowsbuf0(w); + Mat rowsbuf1(w); + Mat rowsbuf2(w); + Mat rowsbuf3(w); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + float* rows2 = rowsbuf2; + float* rows3 = rowsbuf3; + + int prev_sy1 = -3; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // reuse all rows + } + else if (sy == prev_sy1 + 1) + { + // hresize one row + float* rows0_old = rows0; + rows0 = rows1; + rows1 = rows2; + rows2 = rows3; + rows3 = rows0_old; + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S3p = S3 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; + + alphap += 4; + } + } + else if (sy == prev_sy1 + 2) + { + // hresize two rows + float* rows0_old = rows0; + float* rows1_old = rows1; + rows0 = rows2; + rows1 = rows3; + rows2 = rows0_old; + rows3 = rows1_old; + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3; + rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; + + alphap += 4; + } + } + else if (sy == prev_sy1 + 3) + { + // hresize three rows + float* rows0_old = rows0; + float* rows1_old = rows1; + 
float* rows2_old = rows2; + rows0 = rows3; + rows1 = rows0_old; + rows2 = rows1_old; + rows3 = rows2_old; + const float* S1 = src.row(sy); + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S1p = S1 + sx; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3; + rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3; + rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; + + alphap += 4; + } + } + else + { + // hresize four rows + const float* S0 = src.row(sy - 1); + const float* S1 = src.row(sy); + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S0p = S0 + sx; + const float* S1p = S1 + sx; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + rows0p[dx] = S0p[-1] * a0 + S0p[0] * a1 + S0p[1] * a2 + S0p[2] * a3; + rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3; + rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3; + rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; + + alphap += 4; + } + } + + prev_sy1 = sy; + + // vresize + float b0 = beta[0]; + float b1 = beta[1]; + float b2 = beta[2]; + float b3 = beta[3]; + + float* rows0p = rows0; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + float* Dp = dst.row(dy); + for (int dx = 0; dx < w; dx++) + { + // D[x] = rows0[x]*b0 + rows1[x]*b1 + rows2[x]*b2 + rows3[x]*b3; + *Dp++ = *rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3; + } + + beta += 4; + } +} diff --git a/src/layer/loongarch/interp_bicubic_pack4.h b/src/layer/loongarch/interp_bicubic_pack4.h new file mode 100644 index 000000000000..54281691ad79 --- /dev/null +++ b/src/layer/loongarch/interp_bicubic_pack4.h @@ -0,0 +1,286 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +static void resize_bicubic_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) +{ + int w = dst.w; + int h = dst.h; + + // loop body + Mat rowsbuf0(w, (size_t)4 * 4u, 4); + Mat rowsbuf1(w, (size_t)4 * 4u, 4); + Mat rowsbuf2(w, (size_t)4 * 4u, 4); + Mat rowsbuf3(w, (size_t)4 * 4u, 4); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + float* rows2 = rowsbuf2; + float* rows3 = rowsbuf3; + + int prev_sy1 = -3; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // reuse all rows + } + else if (sy == prev_sy1 + 1) + { + // hresize one row + float* rows0_old = rows0; + rows0 = rows1; + rows1 = rows2; + rows2 = rows3; + rows3 = rows0_old; + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S3p = S3 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0); + __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0); + __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0); + __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0); + __m128 _rows3 = __lsx_vfmul_s(_S30, _a0); + _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3); + _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3); + _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3); + __lsx_vst(_rows3, rows3p + dx * 4, 0); + + alphap += 4; + } + } + else if (sy == prev_sy1 + 2) + { + // hresize two rows + float* rows0_old = rows0; + float* rows1_old = rows1; + rows0 = rows2; + rows1 = rows3; + rows2 = rows0_old; + rows3 = rows1_old; + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S20 = (__m128)__lsx_vld(S2p - 4, 0); + __m128 _S21 = (__m128)__lsx_vld(S2p + 0, 0); + __m128 _S22 = (__m128)__lsx_vld(S2p + 4, 0); + __m128 _S23 = (__m128)__lsx_vld(S2p + 8, 0); + __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0); + __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0); + __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0); + __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0); + __m128 _rows2 = __lsx_vfmul_s(_S20, _a0); + __m128 _rows3 = __lsx_vfmul_s(_S30, _a0); + _rows2 = __lsx_vfmadd_s(_a1, _S21, _rows2); + _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3); + _rows2 = __lsx_vfmadd_s(_a2, _S22, _rows2); + _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3); + _rows2 = __lsx_vfmadd_s(_a3, _S23, _rows2); + _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3); + __lsx_vst(_rows2, rows2p + dx * 4, 0); + __lsx_vst(_rows3, rows3p + dx * 4, 0); + + alphap += 4; + } + } + else if (sy == prev_sy1 + 3) + { + // hresize three rows + float* rows0_old = rows0; + float* rows1_old = rows1; + float* rows2_old = rows2; + rows0 = rows3; + rows1 = rows0_old; + rows2 = rows1_old; + rows3 = rows2_old; + const float* S1 = src.row(sy); + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S1p 
= S1 + sx; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S10 = (__m128)__lsx_vld(S1p - 4, 0); + __m128 _S11 = (__m128)__lsx_vld(S1p + 0, 0); + __m128 _S12 = (__m128)__lsx_vld(S1p + 4, 0); + __m128 _S13 = (__m128)__lsx_vld(S1p + 8, 0); + __m128 _S20 = (__m128)__lsx_vld(S2p - 4, 0); + __m128 _S21 = (__m128)__lsx_vld(S2p + 0, 0); + __m128 _S22 = (__m128)__lsx_vld(S2p + 4, 0); + __m128 _S23 = (__m128)__lsx_vld(S2p + 8, 0); + __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0); + __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0); + __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0); + __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0); + __m128 _rows1 = __lsx_vfmul_s(_S10, _a0); + __m128 _rows2 = __lsx_vfmul_s(_S20, _a0); + __m128 _rows3 = __lsx_vfmul_s(_S30, _a0); + _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1); + _rows2 = __lsx_vfmadd_s(_a1, _S21, _rows2); + _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3); + _rows1 = __lsx_vfmadd_s(_a2, _S12, _rows1); + _rows2 = __lsx_vfmadd_s(_a2, _S22, _rows2); + _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3); + _rows1 = __lsx_vfmadd_s(_a3, _S13, _rows1); + _rows2 = __lsx_vfmadd_s(_a3, _S23, _rows2); + _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3); + __lsx_vst(_rows1, rows1p + dx * 4, 0); + __lsx_vst(_rows2, rows2p + dx * 4, 0); + __lsx_vst(_rows3, rows3p + dx * 4, 0); + + alphap += 4; + } + } + else + { + // hresize four rows + const float* S0 = src.row(sy - 1); + const float* S1 = src.row(sy); + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S0p = S0 + sx; + const float* S1p = S1 + sx; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S00 = (__m128)__lsx_vld(S0p - 4, 0); + __m128 _S01 = (__m128)__lsx_vld(S0p + 0, 0); + __m128 _S02 = (__m128)__lsx_vld(S0p + 4, 0); + __m128 _S03 = (__m128)__lsx_vld(S0p + 8, 0); + __m128 _S10 = (__m128)__lsx_vld(S1p - 4, 0); + __m128 _S11 = (__m128)__lsx_vld(S1p + 0, 0); + __m128 _S12 = (__m128)__lsx_vld(S1p + 4, 0); + __m128 _S13 = (__m128)__lsx_vld(S1p + 8, 0); + __m128 _S20 = (__m128)__lsx_vld(S2p - 4, 0); + __m128 _S21 = (__m128)__lsx_vld(S2p + 0, 0); + __m128 _S22 = (__m128)__lsx_vld(S2p + 4, 0); + __m128 _S23 = (__m128)__lsx_vld(S2p + 8, 0); + __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0); + __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0); + __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0); + __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0); + __m128 _rows0 = __lsx_vfmul_s(_S00, _a0); + __m128 _rows1 = __lsx_vfmul_s(_S10, _a0); + __m128 _rows2 = __lsx_vfmul_s(_S20, _a0); + __m128 _rows3 = __lsx_vfmul_s(_S30, _a0); + _rows0 = __lsx_vfmadd_s(_a1, _S01, _rows0); + _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1); + _rows2 = __lsx_vfmadd_s(_a1, _S21, _rows2); + _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3); + _rows0 = __lsx_vfmadd_s(_a2, _S02, _rows0); + _rows1 = __lsx_vfmadd_s(_a2, _S12, _rows1); + _rows2 = __lsx_vfmadd_s(_a2, _S22, _rows2); + _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3); + _rows0 = __lsx_vfmadd_s(_a3, _S03, _rows0); + _rows1 = 
__lsx_vfmadd_s(_a3, _S13, _rows1); + _rows2 = __lsx_vfmadd_s(_a3, _S23, _rows2); + _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3); + __lsx_vst(_rows0, rows0p + dx * 4, 0); + __lsx_vst(_rows1, rows1p + dx * 4, 0); + __lsx_vst(_rows2, rows2p + dx * 4, 0); + __lsx_vst(_rows3, rows3p + dx * 4, 0); + + alphap += 4; + } + } + + prev_sy1 = sy; + + // vresize + __m128 _b0 = __lsx_vreplfr2vr_s(beta[0]); + __m128 _b1 = __lsx_vreplfr2vr_s(beta[1]); + __m128 _b2 = __lsx_vreplfr2vr_s(beta[2]); + __m128 _b3 = __lsx_vreplfr2vr_s(beta[3]); + + float* rows0p = rows0; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + float* Dp = dst.row(dy); + + for (int dx = 0; dx < w; dx++) + { + __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0); + __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0); + __m128 _rows2 = (__m128)__lsx_vld(rows2p, 0); + __m128 _rows3 = (__m128)__lsx_vld(rows3p, 0); + __m128 _D = __lsx_vfmul_s(_rows0, _b0); + _D = __lsx_vfmadd_s(_b1, _rows1, _D); + _D = __lsx_vfmadd_s(_b2, _rows2, _D); + _D = __lsx_vfmadd_s(_b3, _rows3, _D); + __lsx_vst(_D, Dp, 0); + + Dp += 4; + rows0p += 4; + rows1p += 4; + rows2p += 4; + rows3p += 4; + } + + beta += 4; + } +} diff --git a/src/layer/loongarch/interp_bilinear.h b/src/layer/loongarch/interp_bilinear.h new file mode 100644 index 000000000000..ad5a28672bef --- /dev/null +++ b/src/layer/loongarch/interp_bilinear.h @@ -0,0 +1,172 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +static void linear_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner) +{ + double scale = (double)w / outw; + if (align_corner) + { + scale = (double)(w - 1) / (outw - 1); + } + + for (int dx = 0; dx < outw; dx++) + { + float fx = (float)((dx + 0.5) * scale - 0.5); + if (align_corner) + { + fx = (float)(dx * scale); + } + + int sx = floor(fx); + fx -= sx; + + if (sx < 0) + { + sx = 0; + fx = 0.f; + } + if (sx >= w - 1) + { + sx = w - 2; + fx = 1.f; + } + + xofs[dx] = sx; + + alpha[dx * 2] = 1.f - fx; + alpha[dx * 2 + 1] = fx; + } +} + +static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) +{ + int w = dst.w; + int h = dst.h; + + // loop body + Mat rowsbuf0(w); + Mat rowsbuf1(w); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + + int prev_sy1 = -2; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // reuse all rows + } + else if (sy == prev_sy1 + 1) + { + // hresize one row + float* rows0_old = rows0; + rows0 = rows1; + rows1 = rows0_old; + const float* S1 = src.row(sy + 1); + + const float* alphap = alpha; + float* rows1p = rows1; + int dx = 0; + for (; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S1p = S1 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; + + alphap += 2; + } + } + else + { + // hresize two rows + const float* S0 = src.row(sy); + const float* S1 = src.row(sy + 1); + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + int dx = 0; + for (; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S0p = S0 + sx; + const float* S1p = S1 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + rows0p[dx] = S0p[0] * a0 + S0p[1] * a1; + rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; + + alphap += 2; + } + } + + prev_sy1 = sy; + + // vresize + float b0 = beta[0]; + float b1 = beta[1]; + + float* rows0p = rows0; + float* rows1p = rows1; + float* Dp = dst.row(dy); + +#if __loongarch_sx + int nn = w >> 3; +#else + int nn = 0; +#endif + int remain = w - (nn << 3); + +#if __loongarch_sx + __m128 _b0 = __lsx_vreplfr2vr_s(b0); + __m128 _b1 = __lsx_vreplfr2vr_s(b1); + for (; nn > 0; nn--) + { + __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0); + __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0); + + __m128 _D = __lsx_vfmul_s(_rows0, _b0); + _D = __lsx_vfmadd_s(_b1, _rows1, _D); + + __lsx_vst(_D, Dp, 0); + + __m128 _rows0n = (__m128)__lsx_vld(rows0p + 4, 0); + __m128 _rows1n = (__m128)__lsx_vld(rows1p + 4, 0); + + __m128 _Dn = __lsx_vfmul_s(_rows0n, _b0); + _Dn = __lsx_vfmadd_s(_b1, _rows1n, _Dn); + + __lsx_vst(_Dn, Dp + 4, 0); + + Dp += 8; + rows0p += 8; + rows1p += 8; + } +#endif // __loongarch_sx + for (; remain; --remain) + { + // D[x] = rows0[x]*b0 + rows1[x]*b1; + *Dp++ = *rows0p++ * b0 + *rows1p++ * b1; + } + + beta += 2; + } +} diff --git a/src/layer/loongarch/interp_bilinear_pack4.h b/src/layer/loongarch/interp_bilinear_pack4.h new file mode 100644 index 000000000000..2cfb138a1cbd --- /dev/null +++ b/src/layer/loongarch/interp_bilinear_pack4.h @@ -0,0 +1,123 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void resize_bilinear_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) +{ + int w = dst.w; + int h = dst.h; + + // loop body + Mat rowsbuf0(w, (size_t)4 * 4u, 4); + Mat rowsbuf1(w, (size_t)4 * 4u, 4); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + + int prev_sy1 = -2; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // reuse all rows + } + else if (sy == prev_sy1 + 1) + { + // hresize one row + float* rows0_old = rows0; + rows0 = rows1; + rows1 = rows0_old; + const float* S1 = src.row(sy + 1); + + const float* alphap = alpha; + float* rows1p = rows1; + int dx = 0; + for (; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S1p = S1 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + + __m128 _S10 = (__m128)__lsx_vld(S1p, 0); + __m128 _S11 = (__m128)__lsx_vld(S1p + 4, 0); + __m128 _rows1 = __lsx_vfmul_s(_S10, _a0); + _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1); + __lsx_vst(_rows1, rows1p + dx * 4, 0); + + alphap += 2; + } + } + else + { + // hresize two rows + const float* S0 = src.row(sy); + const float* S1 = src.row(sy + 1); + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + int dx = 0; + for (; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S0p = S0 + sx; + const float* S1p = S1 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + + __m128 _S00 = (__m128)__lsx_vld(S0p, 0); + __m128 _S01 = (__m128)__lsx_vld(S0p + 4, 0); + __m128 _S10 = (__m128)__lsx_vld(S1p, 0); + __m128 _S11 = (__m128)__lsx_vld(S1p + 4, 0); + __m128 _rows0 = __lsx_vfmul_s(_S00, _a0); + __m128 _rows1 = __lsx_vfmul_s(_S10, _a0); + _rows0 = __lsx_vfmadd_s(_a1, _S01, _rows0); + _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1); + __lsx_vst(_rows0, rows0p + dx * 4, 0); + __lsx_vst(_rows1, rows1p + dx * 4, 0); + + alphap += 2; + } + } + + prev_sy1 = sy; + + // vresize + __m128 _b0 = __lsx_vreplfr2vr_s(beta[0]); + __m128 _b1 = __lsx_vreplfr2vr_s(beta[1]); + + float* rows0p = rows0; + float* rows1p = rows1; + float* Dp = dst.row(dy); + + for (int dx = 0; dx < w; dx++) + { + __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0); + __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0); + __m128 _D = __lsx_vfmul_s(_rows0, _b0); + _D = __lsx_vfmadd_s(_b1, _rows1, _D); + __lsx_vst(_D, Dp, 0); + + Dp += 4; + rows0p += 4; + rows1p += 4; + } + + beta += 2; + } +} diff --git a/src/layer/loongarch/interp_loongarch.cpp b/src/layer/loongarch/interp_loongarch.cpp new file mode 100644 index 000000000000..94d25cf005eb --- /dev/null +++ b/src/layer/loongarch/interp_loongarch.cpp @@ -0,0 +1,470 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "interp_loongarch.h"
+
+#include <math.h>
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif // __loongarch_sx
+
+#include "loongarch_usability.h"
+
+namespace ncnn {
+
+#include "interp_bicubic.h"
+#include "interp_bilinear.h"
+
+#if __loongarch_sx
+#include "interp_bicubic_pack4.h"
+#include "interp_bilinear_pack4.h"
+#endif
+
+Interp_loongarch::Interp_loongarch()
+{
+#if __loongarch_sx
+    support_packing = true;
+#endif // __loongarch_sx
+}
+
+int Interp_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& reference_blob = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    int h = bottom_blob.h;
+    int w = bottom_blob.w;
+    int channels = bottom_blob.c;
+    int dims = bottom_blob.dims;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    int outw = reference_blob.w;
+    int outh = reference_blob.h;
+
+    if (dims == 1)
+    {
+        top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+#if __loongarch_sx
+        if (elempack == 4)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < w; q++)
+            {
+                Mat top_blob_c = top_blob.channel(q);
+                __m128 _v = (__m128)__lsx_vld((const float*)bottom_blob + q * 4, 0);
+                top_blob_c.fill(_v);
+            }
+
+            return 0;
+        }
+#endif // __loongarch_sx
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < w; q++)
+        {
+            Mat top_blob_c = top_blob.channel(q);
+            const float v = bottom_blob[q];
+            top_blob_c.fill(v);
+        }
+
+        return 0;
+    }
+
+    if (dims == 2)
+    {
+        if (outw == w)
+        {
+            top_blob = bottom_blob;
+            return 0;
+        }
+
+        top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+#if __loongarch_sx
+        if (elempack == 4)
+        {
+            if (resize_type == 1) // nearest
+            {
+                const float ws = output_width ?
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + + __m128 _p = (__m128)__lsx_vld(ptr + in_x * 4, 0); + __lsx_vst(_p, outptr, 0); + + outptr += 4; + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outw * 2]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + linear_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x] * 4; + const float* Sp = ptr + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + + __m128 _S0 = (__m128)__lsx_vld(Sp, 0); + __m128 _S1 = (__m128)__lsx_vld(Sp + 4, 0); + __m128 _p = __lsx_vfmul_s(_S0, _a0); + _p = __lsx_vfmadd_s(_a1, _S1, _p); + __lsx_vst(_p, outptr, 0); + + alphap += 2; + outptr += 4; + } + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outw * 4]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x] * 4; + const float* Sp = ptr + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S0 = (__m128)__lsx_vld(Sp - 4, 0); + __m128 _S1 = (__m128)__lsx_vld(Sp + 0, 0); + __m128 _S2 = (__m128)__lsx_vld(Sp + 4, 0); + __m128 _S3 = (__m128)__lsx_vld(Sp + 8, 0); + __m128 _p = __lsx_vfmul_s(_S0, _a0); + _p = __lsx_vfmadd_s(_a1, _S1, _p); + _p = __lsx_vfmadd_s(_a2, _S2, _p); + _p = __lsx_vfmadd_s(_a3, _S3, _p); + __lsx_vst(_p, outptr, 0); + + alphap += 4; + outptr += 4; + } + } + + delete[] buf; + } + + return 0; + } +#endif // __loongarch_sx + + if (resize_type == 1) // nearest + { + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + *outptr++ = ptr[in_x]; + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outw * 2]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + linear_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x]; + const float* Sp = ptr + sx; + float a0 = alphap[0]; + float a1 = alphap[1]; + *outptr++ = Sp[0] * a0 + Sp[1] * a1; + alphap += 2; + } + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outw * 4]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x]; + const float* Sp = ptr + sx; + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + *outptr++ = Sp[-1] * a0 + Sp[0] * a1 + Sp[1] * a2 + Sp[2] * a3; + alphap += 4; + } + } + + delete[] buf; + } + + return 0; + } + + if (outw == w && outh == h) + { + top_blob = bottom_blob; + return 0; + } + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 4) + { + if (resize_type == 1) // nearest + { + const float hs = output_height ? h / (float)outh : 1.f / height_scale; + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + for (int y = 0; y < outh; y++) + { + int in_y = std::min((int)(y * hs), (h - 1)); + + const float* ptr = src.row(in_y); + float* outptr = dst.row(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + + __m128 _p = (__m128)__lsx_vld(ptr + in_x * 4, 0); + __lsx_vst(_p, outptr, 0); + + outptr += 4; + } + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outh + outw * 2 + outh * 2]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; + float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; + + linear_coeffs(w, outw, xofs, alpha, align_corner); + linear_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bilinear_image_pack4(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outh + outw * 4 + outh * 4]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; + float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + cubic_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bicubic_image_pack4(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + return 0; + } +#endif // __loongarch_sx + + if (resize_type == 1) // nearest + { + const float hs = output_height ? h / (float)outh : 1.f / height_scale; + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            for (int y = 0; y < outh; y++)
+            {
+                int in_y = std::min((int)(y * hs), (h - 1));
+
+                const float* ptr = src.row(in_y);
+                float* outptr = dst.row(y);
+                for (int x = 0; x < outw; x++)
+                {
+                    int in_x = std::min((int)(x * ws), (w - 1));
+                    *outptr++ = ptr[in_x];
+                }
+            }
+        }
+    }
+
+    if (resize_type == 2) // bilinear
+    {
+        int* buf = new int[outw + outh + outw * 2 + outh * 2];
+
+        int* xofs = buf;        //new int[outw];
+        int* yofs = buf + outw; //new int[outh];
+
+        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
+        float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
+
+        linear_coeffs(w, outw, xofs, alpha, align_corner);
+        linear_coeffs(h, outh, yofs, beta, align_corner);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bilinear_image(src, dst, alpha, xofs, beta, yofs);
+        }
+
+        delete[] buf;
+    }
+
+    if (resize_type == 3) // bicubic
+    {
+        int* buf = new int[outw + outh + outw * 4 + outh * 4];
+
+        int* xofs = buf;        //new int[outw];
+        int* yofs = buf + outw; //new int[outh];
+
+        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
+        float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
+
+        cubic_coeffs(w, outw, xofs, alpha, align_corner);
+        cubic_coeffs(h, outh, yofs, beta, align_corner);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bicubic_image(src, dst, alpha, xofs, beta, yofs);
+        }
+
+        delete[] buf;
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/interp_loongarch.h b/src/layer/loongarch/interp_loongarch.h
new file mode 100644
index 000000000000..4c0e0f3dc86b
--- /dev/null
+++ b/src/layer/loongarch/interp_loongarch.h
@@ -0,0 +1,32 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_INTERP_LOONGARCH_H
+#define LAYER_INTERP_LOONGARCH_H
+
+#include "interp.h"
+
+namespace ncnn {
+
+class Interp_loongarch : virtual public Interp
+{
+public:
+    Interp_loongarch();
+
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_INTERP_LOONGARCH_H
diff --git a/src/layer/loongarch/loongarch_activation.h b/src/layer/loongarch/loongarch_activation.h
new file mode 100644
index 000000000000..abb268f4bb6d
--- /dev/null
+++ b/src/layer/loongarch/loongarch_activation.h
@@ -0,0 +1,70 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LOONGARCH_ACTIVATION_H
+#define LOONGARCH_ACTIVATION_H
+
+#include "fused_activation.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+
+static inline __m128 activation_ps(__m128 _v, int activation_type, const ncnn::Mat& activation_params)
+{
+    if (activation_type == 1)
+    {
+        __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
+        _v = __lsx_vfmax_s(_v, _zero);
+    }
+    else if (activation_type == 2)
+    {
+        __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
+        __m128 _slope = (__m128)__lsx_vreplfr2vr_s(activation_params[0]);
+        __m128i _lemask = __lsx_vfcmp_cle_s(_v, _zero);
+        __m128 _ps = __lsx_vfmul_s(_v, _slope);
+        _v = (__m128)__lsx_vbitsel_v((__m128i)_v, (__m128i)_ps, (__m128i)_lemask);
+    }
+    else if (activation_type == 3)
+    {
+        __m128 _min = (__m128)__lsx_vreplfr2vr_s(activation_params[0]);
+        __m128 _max = (__m128)__lsx_vreplfr2vr_s(activation_params[1]);
+        _v = __lsx_vfmax_s(_v, _min);
+        _v = __lsx_vfmin_s(_v, _max);
+    }
+    else if (activation_type == 4)
+    {
+        _v = sigmoid_ps(_v);
+    }
+    else if (activation_type == 5)
+    {
+        _v = __lsx_vfmul_s(_v, tanh_ps(log_ps(__lsx_vfadd_s(exp_ps(_v), (__m128)__lsx_vreplfr2vr_s(1.f)))));
+    }
+    else if (activation_type == 6)
+    {
+        __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(activation_params[0]);
+        __m128 _beta = (__m128)__lsx_vreplfr2vr_s(activation_params[1]);
+        __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
+        __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
+        __m128 _outp = __lsx_vfmadd_s(_alpha, _v, _beta);
+        _outp = __lsx_vfmax_s(_outp, _zero);
+        _outp = __lsx_vfmin_s(_outp, _one);
+        _v = __lsx_vfmul_s(_outp, _v);
+    }
+
+    return _v;
+}
+#endif // __loongarch_sx
+
+#endif // LOONGARCH_ACTIVATION_H
diff --git a/src/layer/loongarch/loongarch_usability.h b/src/layer/loongarch/loongarch_usability.h
new file mode 100644
index 000000000000..d3ae5dec279d
--- /dev/null
+++ b/src/layer/loongarch/loongarch_usability.h
@@ -0,0 +1,236 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
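The usability header that follows provides the float broadcast helper (__lsx_vreplfr2vr_s, built on an int/float union), horizontal reductions, and the float2int8* quantizers. The vector quantizers emulate round-to-nearest (half away from zero) by adding a sign-matched 0.5 before truncation and then clamping to [-127, 127]. A scalar rendering of that rule, with the helper name quant_ref chosen here purely for illustration:

#include <cstdio>

static signed char quant_ref(float v)
{
    float biased = v >= 0.f ? v + 0.5f : v - 0.5f; // sign-matched 0.5, like vor(_p5, sign)
    int i = (int)biased;                           // truncate toward zero, like vftintrz
    if (i > 127) i = 127;                          // clamp, like vsat/vmax in the vector code
    if (i < -127) i = -127;
    return (signed char)i;
}

int main()
{
    const float tests[] = {1.4f, 1.5f, -1.5f, -200.f, 200.f};
    for (float t : tests)
        printf("%.1f -> %d\n", t, quant_ref(t)); // 1, 2, -2, -127, 127
    return 0;
}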
+
+#ifndef LOONGARCH_USABILITY_H
+#define LOONGARCH_USABILITY_H
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif // __loongarch_sx
+
+#include <stdint.h>
+#include <math.h>
+
+namespace ncnn {
+
+typedef union
+{
+    int32_t i;
+    float f;
+} FloatInt;
+
+} // namespace ncnn
+
+#if __loongarch_sx
+/* declare some loongarch constants with union */
+#define _LOONGARCH_FLOAT_CONST(Name, Val) \
+    static const ncnn::FloatInt Name = {.f = Val}
+
+/* float type data load instructions */
+static NCNN_FORCEINLINE __m128 __lsx_vreplfr2vr_s(float val)
+{
+    ncnn::FloatInt fi_tmpval = {.f = val};
+    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+}
+
+static NCNN_FORCEINLINE float __lsx_reduce_fadd_s(__m128 _v)
+{
+    // TODO find a more efficient way
+    float* _v_p = (float*)&_v;
+    return _v_p[0] + _v_p[1] + _v_p[2] + _v_p[3];
+}
+
+static NCNN_FORCEINLINE int __lsx_reduce_add_w(__m128i _v)
+{
+    // TODO find a more efficient way
+    int* _v_p = (int*)&_v;
+    return _v_p[0] + _v_p[1] + _v_p[2] + _v_p[3];
+}
+
+#endif // __loongarch_sx
+
+static NCNN_FORCEINLINE signed char float2int8(float v)
+{
+    int int32 = round(v);
+    if (int32 > 127) return 127;
+    if (int32 < -127) return -127;
+    return (signed char)int32;
+}
+
+#if __loongarch_sx
+static NCNN_FORCEINLINE __m128i float2int8(__m128 _v)
+{
+    // simulate round to nearest via +/-0.5
+    __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f);
+    __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31);
+
+    __m128i _sign = __lsx_vand_v((__m128i)_v, _signmask);
+    __m128 _p5s = (__m128)__lsx_vor_v((__m128i)_p5, (__m128i)_sign);
+    __m128 _v5 = __lsx_vfadd_s(_v, _p5s);
+    __m128i _v32 = __lsx_vftintrz_w_s(_v5);
+
+    __m128i _v32_16 = __lsx_vsat_w(_v32, 15);
+    __m128i _v16 = __lsx_vpickev_h(_v32_16, _v32_16);
+    _v16 = __lsx_vmax_h(_v16, __lsx_vreplgr2vr_h(-127));
+    __m128i _v16_8 = __lsx_vsat_h(_v16, 7);
+    __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8);
+
+    return _v8;
+}
+
+static NCNN_FORCEINLINE int64_t float2int8(__m128 _vlow, __m128 _vhigh)
+{
+    // simulate round to nearest via +/-0.5
+    __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f);
+    __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31);
+
+    __m128i _signlow = __lsx_vand_v((__m128i)_vlow, _signmask);
+    __m128i _signhigh = __lsx_vand_v((__m128i)_vhigh, _signmask);
+    __m128 _p5low = (__m128)__lsx_vor_v((__m128i)_p5, _signlow);
+    __m128 _p5high = (__m128)__lsx_vor_v((__m128i)_p5, _signhigh);
+    __m128 _vlow5 = __lsx_vfadd_s(_vlow, _p5low);
+    __m128 _vhigh5 = __lsx_vfadd_s(_vhigh, _p5high);
+    __m128i _vlow32 = __lsx_vftintrz_w_s(_vlow5);
+    __m128i _vhigh32 = __lsx_vftintrz_w_s(_vhigh5);
+
+    __m128i _vlow32_16 = __lsx_vsat_w(_vlow32, 15);
+    __m128i _vhigh32_16 = __lsx_vsat_w(_vhigh32, 15);
+    __m128i _v16 = __lsx_vpickev_h(_vhigh32_16, _vlow32_16);
+    _v16 = __lsx_vmax_h(_v16, __lsx_vreplgr2vr_h(-127));
+    __m128i _v16_8 = __lsx_vsat_h(_v16, 7);
+    __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8);
+
+    return _v8[0];
+}
+
+static NCNN_FORCEINLINE __m128i float2int8relu(__m128 _v)
+{
+    // simulate round to nearest via +/-0.5
+    __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f);
+    __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31);
+
+    __m128i _sign = __lsx_vand_v((__m128i)_v, _signmask);
+    __m128 _p5s = (__m128)__lsx_vor_v((__m128i)_p5, _sign);
+    __m128 _v5 = __lsx_vfadd_s(_v, _p5s);
+    __m128i _v32 = __lsx_vftintrz_w_s(_v5);
+
+    __m128i _v32_16 = __lsx_vsat_w(_v32, 15);
+    __m128i _v16 = __lsx_vpickev_h(_v32_16, _v32_16);
+    _v16 = __lsx_vmaxi_h(_v16, 0);
+    __m128i _v16_8 = __lsx_vsat_h(_v16, 7);
+    __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8);
+
+    return _v8;
+}
+
+static NCNN_FORCEINLINE
int64_t float2int8relu(__m128 _vlow, __m128 _vhigh) +{ + // simulate round to nearest via +/-0.5 + __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _signlow = __lsx_vand_v((__m128i)_vlow, _signmask); + __m128i _signhigh = __lsx_vand_v((__m128i)_vhigh, _signmask); + __m128 _p5low = (__m128)__lsx_vor_v((__m128i)_p5, _signlow); + __m128 _p5high = (__m128)__lsx_vor_v((__m128i)_p5, _signhigh); + __m128 _vlow5 = __lsx_vfadd_s(_vlow, _p5low); + __m128 _vhigh5 = __lsx_vfadd_s(_vhigh, _p5high); + __m128i _vlow32 = __lsx_vftintrz_w_s(_vlow5); + __m128i _vhigh32 = __lsx_vftintrz_w_s(_vhigh5); + + __m128i _vlow32_16 = __lsx_vsat_w(_vlow32, 15); + __m128i _vhigh32_16 = __lsx_vsat_w(_vhigh32, 15); + __m128i _v16 = __lsx_vpickev_h(_vhigh32_16, _vlow32_16); + _v16 = __lsx_vmaxi_h(_v16, 0); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8[0]; +} + +static NCNN_FORCEINLINE __m128i float2int8leakyrelu(__m128 _v, __m128 _slope) +{ + __m128 _v_leaky = __lsx_vfmul_s(_v, _slope); + + // simulate round to nearest via +/-0.5 + __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _sign = __lsx_vand_v((__m128i)_v, _signmask); + __m128 _p5s = (__m128)__lsx_vor_v((__m128i)_p5, _sign); + __m128 _v5 = __lsx_vfadd_s(_v, _p5s); + __m128i _v32 = __lsx_vftintrz_w_s(_v5); + + __m128i _sign_leaky = __lsx_vand_v((__m128i)_v_leaky, _signmask); + __m128 _p5_leaky = (__m128)__lsx_vor_v((__m128i)_p5, _sign_leaky); + __m128 _v5_leaky = __lsx_vfadd_s(_v_leaky, _p5_leaky); + __m128i _v32_leaky = __lsx_vftintrz_w_s(_v5_leaky); + + __m128i _v32_16 = __lsx_vsat_w(_v32, 15); + __m128i _v16 = __lsx_vpickev_h(_v32_16, _v32_16); + + __m128i _v32_16_leaky = __lsx_vsat_w(_v32_leaky, 15); + __m128i _v16_leaky = __lsx_vpickev_h(_v32_16_leaky, _v32_16_leaky); + + _v16 = __lsx_vmax_h(_v16, _v16_leaky); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8; +} + +static NCNN_FORCEINLINE int64_t float2int8leakyrelu(__m128 _vlow, __m128 _vhigh, __m128 _slope) +{ + __m128 _vlow_leaky = __lsx_vfmul_s(_vlow, _slope); + __m128 _vhigh_leaky = __lsx_vfmul_s(_vhigh, _slope); + + // simulate round to nearest via +/-0.5 + __m128i _p5 = (__m128i)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _signlow = __lsx_vand_v((__m128i)_vlow, _signmask); + __m128i _signhigh = __lsx_vand_v((__m128i)_vhigh, _signmask); + __m128 _p5low = (__m128)__lsx_vor_v(_p5, _signlow); + __m128 _p5high = (__m128)__lsx_vor_v(_p5, _signhigh); + __m128 _vlow5 = __lsx_vfadd_s(_vlow, _p5low); + __m128 _vhigh5 = __lsx_vfadd_s(_vhigh, _p5high); + __m128i _vlow32 = __lsx_vftintrz_w_s(_vlow5); + __m128i _vhigh32 = __lsx_vftintrz_w_s(_vhigh5); + + __m128i _signlow_leaky = __lsx_vand_v((__m128i)_vlow_leaky, _signmask); + __m128i _signhigh_leaky = __lsx_vand_v((__m128i)_vhigh_leaky, _signmask); + __m128 _p5low_leaky = (__m128)__lsx_vor_v(_p5, _signlow_leaky); + __m128 _p5high_leaky = (__m128)__lsx_vor_v(_p5, _signhigh_leaky); + __m128 _vlow5_leaky = __lsx_vfadd_s(_vlow_leaky, _p5low_leaky); + __m128 _vhigh5_leaky = __lsx_vfadd_s(_vhigh_leaky, _p5high_leaky); + __m128i _vlow32_leaky = __lsx_vftintrz_w_s(_vlow5_leaky); + __m128i _vhigh32_leaky = __lsx_vftintrz_w_s(_vhigh5_leaky); + + __m128i _vlow32_16 = __lsx_vsat_w(_vlow32, 15); + __m128i _vhigh32_16 = __lsx_vsat_w(_vhigh32, 15); + __m128i _v16 = __lsx_vpickev_h(_vhigh32_16, 
_vlow32_16); + + __m128i _vlow32_16_leaky = __lsx_vsat_w(_vlow32_leaky, 15); + __m128i _vhigh32_16_leaky = __lsx_vsat_w(_vhigh32_leaky, 15); + __m128i _v16_leaky = __lsx_vpickev_h(_vhigh32_16_leaky, _vlow32_16_leaky); + + _v16 = __lsx_vmax_h(_v16, _v16_leaky); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8[0]; +} +#endif // __loongarch_sx + +#endif // LOONGARCH_USABILITY_H diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h new file mode 100644 index 000000000000..ededa5966593 --- /dev/null +++ b/src/layer/loongarch/lsx_mathfun.h @@ -0,0 +1,258 @@ +/* LOONGARCH implementation of exp + * + * Inspired by Intel Approximate Math library, and based on the + * corresponding algorithms of the cephes math library + * Copyright (C) 2022 yala ;. All rights reserved. + */ + +/* + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * (this is the zlib license) + */ + +#ifndef LSX_MATHFUN_H +#define LSX_MATHFUN_H + +#include "loongarch_usability.h" + +#include + +_LOONGARCH_FLOAT_CONST(c_1, 1.0f); +_LOONGARCH_FLOAT_CONST(c_2, 2.0f); +_LOONGARCH_FLOAT_CONST(c_n1, -1.0f); +_LOONGARCH_FLOAT_CONST(c_0p5, 0.5f); + +#define c_inv_mant_mask ~0x7f800000u +_LOONGARCH_FLOAT_CONST(c_cephes_SQRTHF, 0.707106781186547524); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p0, 7.0376836292E-2); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p1, -1.1514610310E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p2, 1.1676998740E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p3, -1.2420140846E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p4, +1.4249322787E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p5, -1.6668057665E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p6, +2.0000714765E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p7, -2.4999993993E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p8, +3.3333331174E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_q1, -2.12194440e-4); +_LOONGARCH_FLOAT_CONST(c_cephes_log_q2, 0.693359375); + +/* natural logarithm computed for 4 simultaneous float + * return NaN for x <= 0 + */ +static inline __m128 log_ps(__m128 x) +{ + __m128 one = (__m128)__lsx_vreplgr2vr_w(c_1.i); + + x = __lsx_vfmax_s(x, (__m128)__lsx_vreplgr2vr_w(0)); /* force flush to zero on denormal values */ + __m128i invalid_mask = __lsx_vfcmp_cle_s(x, (__m128)__lsx_vreplgr2vr_w(0)); + + __m128i ux = (__m128i)(x); + + __m128i emm0 = __lsx_vsrl_w(ux, __lsx_vreplgr2vr_w(23)); + + /* keep only the fractional part */ + ux = __lsx_vand_v(ux, __lsx_vreplgr2vr_w(c_inv_mant_mask)); + ux = __lsx_vor_v(ux, __lsx_vreplgr2vr_w(c_0p5.i)); + x = (__m128)(ux); + + emm0 = __lsx_vsub_w(emm0, __lsx_vreplgr2vr_w(0x7f)); + __m128 e = __lsx_vffint_s_w(emm0); + + e = __lsx_vfadd_s(e, one); + + /* part2: + * if( x < 
SQRTHF ) { + * e -= 1; + * x = x + x - 1.0; + * } else { x = x - 1.0; } + */ + __m128i mask = __lsx_vfcmp_clt_s((__m128)x, (__m128)__lsx_vreplgr2vr_w(c_cephes_SQRTHF.i)); + __m128 tmp = (__m128)(__lsx_vand_v((__m128i)(x), (__m128i)mask)); + x = __lsx_vfsub_s(x, one); + e = __lsx_vfsub_s(e, (__m128)(__lsx_vand_v((__m128i)(one), (__m128i)mask))); + x = __lsx_vfadd_s(x, tmp); + + __m128 z = __lsx_vfmul_s(x, x); + + __m128 y = (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p0.i); + + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p1.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p2.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p3.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p4.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p5.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p6.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p7.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p8.i)); + y = __lsx_vfmul_s(y, x); + + y = __lsx_vfmul_s(y, z); + + tmp = __lsx_vfmul_s(e, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_q1.i)); + y = __lsx_vfadd_s(y, tmp); + + tmp = __lsx_vfmul_s(z, (__m128)__lsx_vreplgr2vr_w(c_0p5.i)); + y = __lsx_vfsub_s(y, tmp); + + tmp = __lsx_vfmul_s(e, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_q2.i)); + x = __lsx_vfadd_s(x, y); + x = __lsx_vfadd_s(x, tmp); + x = (__m128)(__lsx_vor_v((__m128i)(x), (__m128i)invalid_mask)); // negative arg will be NAN + return x; +} + +_LOONGARCH_FLOAT_CONST(c_exp_hi, 88.3762626647949f); +_LOONGARCH_FLOAT_CONST(c_exp_lo, -88.3762626647949f); + +_LOONGARCH_FLOAT_CONST(c_cephes_LOG2EF, 1.44269504088896341); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_C1, 0.693359375); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_C2, -2.12194440e-4); + +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p0, 1.9875691500E-4); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p1, 1.3981999507E-3); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p2, 8.3334519073E-3); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p3, 4.1665795894E-2); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p4, 1.6666665459E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p5, 5.0000001201E-1); + +/* exp() computed for 4 float at once */ +static inline __m128 exp_ps(__m128 x) +{ + __m128 tmp, fx; + + __m128 one = (__m128)__lsx_vreplgr2vr_w(c_1.i); + x = __lsx_vfmin_s(x, (__m128)__lsx_vreplgr2vr_w(c_exp_hi.i)); + x = __lsx_vfmax_s(x, (__m128)__lsx_vreplgr2vr_w(c_exp_lo.i)); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = __lsx_vfmul_s(x, (__m128)__lsx_vreplgr2vr_w(c_cephes_LOG2EF.i)); + fx = __lsx_vfadd_s(fx, (__m128)__lsx_vreplgr2vr_w(c_0p5.i)); + + /* perform a floorf */ + tmp = __lsx_vffint_s_w(__lsx_vftint_w_s(fx)); + + /* if greater, substract 1 */ + __m128i mask = __lsx_vfcmp_clt_s(fx, tmp); + mask = __lsx_vand_v(mask, (__m128i)one); + + fx = __lsx_vfsub_s(tmp, (__m128)mask); + + tmp = __lsx_vfmul_s(fx, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_C1.i)); + __m128 z = __lsx_vfmul_s(fx, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_C2.i)); + x = __lsx_vfsub_s(x, tmp); + x = __lsx_vfsub_s(x, z); + + __m128 y = (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p0.i); + + z = __lsx_vfmul_s(x, x); + + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p1.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p2.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p3.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p4.i)); + y = __lsx_vfmadd_s(x, y, 
(__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p5.i)); + + y = __lsx_vfmul_s(y, z); + y = __lsx_vfadd_s(y, x); + y = __lsx_vfadd_s(y, one); + + /* build 2^n */ + __m128i mm; + mm = __lsx_vftintrz_w_s(fx); + mm = __lsx_vadd_w(mm, __lsx_vreplgr2vr_w(0x7f)); + mm = __lsx_vsll_w(mm, __lsx_vreplgr2vr_w(23)); + + y = __lsx_vfmul_s(y, (__m128)mm); + return y; +} + +_LOONGARCH_FLOAT_CONST(c_tanh_tiny, 1e-4f); +_LOONGARCH_FLOAT_CONST(c_tanh_hi, 9.0f); +// The monomial coefficients of the numerator polynomial (odd). +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_1, 4.89352455891786e-3f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_3, 6.37261928875436e-4f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_5, 1.48572235717979e-5f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_7, 5.12229709037114e-8f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_9, -8.60467152213735e-11f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_11, 2.00018790482477e-13f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_13, -2.76076847742355e-16f); +// The monomial coefficients of the denominator polynomial (even). +_LOONGARCH_FLOAT_CONST(c_tanh_beta_0, 4.89352518554385e-3f); +_LOONGARCH_FLOAT_CONST(c_tanh_beta_2, 2.26843463243900e-3f); +_LOONGARCH_FLOAT_CONST(c_tanh_beta_4, 1.18534705686654e-4f); +_LOONGARCH_FLOAT_CONST(c_tanh_beta_6, 1.19825839466702e-6f); + +/* tanh() computed for 4 float at once */ +static inline __m128 tanh_ps(__m128 x) +{ + __m128 x2 = (__m128)__lsx_vbitclri_w((__m128i)x, 31); + __m128i tiny_mask = __lsx_vfcmp_clt_s((__m128)x2, (__m128)(__m128)__lsx_vreplgr2vr_w(c_tanh_tiny.i)); + __m128i sig_mask = __lsx_vreplgr2vr_w(1 << 31); + __m128i sig_save = __lsx_vand_v((__m128i)x, sig_mask); + + // clamp the inputs to the range [-9, 9] since anything outside + // this range is -/+1.0f in single-precision. + x2 = (__m128)__lsx_vbitsel_v((__m128i)x2, (__m128i)__lsx_vreplgr2vr_w(c_tanh_hi.i), (__m128i)__lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(c_tanh_hi.i), (__m128)x2)); + + // since the polynomials are odd/even, we need x**2. + __m128 z = __lsx_vfmul_s(x2, x2); + + // evaluate the numerator polynomial y. + __m128 y = (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_13.i); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_11.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_9.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_7.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_5.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_3.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_1.i)); + y = __lsx_vfmul_s(y, x2); + + // evaluate the denominator polynomial w. + __m128 w = (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_6.i); + w = __lsx_vfmadd_s(z, w, (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_4.i)); + w = __lsx_vfmadd_s(z, w, (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_2.i)); + w = __lsx_vfmadd_s(z, w, (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_0.i)); + + // divide the numerator by the denominator. + y = __lsx_vfdiv_s(y, w); + + // reinstate the sign. + y = (__m128)__lsx_vor_v((__m128i)y, sig_save); + + // when the argument is very small in magnitude it's more accurate to just return it. 
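+    // (|x| < c_tanh_tiny selects the raw input through tiny_mask below; other
+    // inputs use the odd/even polynomial ratio y / w evaluated above, with |x|
+    // clamped to c_tanh_hi = 9.0, where single-precision tanh has already
+    // saturated to +/-1)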
+ y = (__m128)__lsx_vbitsel_v((__m128i)y, (__m128i)x, (__m128i)tiny_mask); + + return y; +} + +static inline __m128 pow_ps(__m128 a, __m128 b) +{ + // pow(x, m) = exp(m * log(x)) + return exp_ps(__lsx_vfmul_s(b, log_ps(a))); +} + +static inline __m128 sigmoid_ps(__m128 _v) +{ + __m128 _one = __lsx_vreplfr2vr_s(1.f); + _v = (__m128)__lsx_vbitrevi_w((__m128i)_v, 31); + _v = exp_ps(_v); + _v = __lsx_vfadd_s(_v, _one); + return __lsx_vfdiv_s(_one, _v); +} + +#endif // LSX_MATHFUN_H diff --git a/src/layer/loongarch/mish_loongarch.cpp b/src/layer/loongarch/mish_loongarch.cpp new file mode 100644 index 000000000000..8558e2f8cb06 --- /dev/null +++ b/src/layer/loongarch/mish_loongarch.cpp @@ -0,0 +1,70 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "mish_loongarch.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +#include + +namespace ncnn { + +Mish_loongarch::Mish_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Mish_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmul_s(_p, tanh_ps(log_ps(__lsx_vfadd_s(exp_ps(_p), _one)))); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = *ptr * tanh(log(exp(*ptr) + 1.f)); + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/mish_loongarch.h b/src/layer/loongarch/mish_loongarch.h new file mode 100644 index 000000000000..97c6f0520f50 --- /dev/null +++ b/src/layer/loongarch/mish_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
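+//
+// Mish activation: mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + e^x)).
+// The LSX path in mish_loongarch.cpp evaluates this four lanes at a time with
+// exp_ps / log_ps / tanh_ps from lsx_mathfun.h; the scalar tail uses libm and
+// is equivalent to the reference expression
+//
+//     float mish_ref(float x) { return x * tanhf(logf(expf(x) + 1.f)); }
+//
+// (mish_ref is only an illustrative name, not a function defined in ncnn)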
+ +#ifndef LAYER_MISH_LOONGARCH_H +#define LAYER_MISH_LOONGARCH_H + +#include "mish.h" + +namespace ncnn { + +class Mish_loongarch : virtual public Mish +{ +public: + Mish_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_MISH_LOONGARCH_H diff --git a/src/layer/loongarch/packing_loongarch.cpp b/src/layer/loongarch/packing_loongarch.cpp new file mode 100644 index 000000000000..cf68b7b34d69 --- /dev/null +++ b/src/layer/loongarch/packing_loongarch.cpp @@ -0,0 +1,569 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "packing_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +namespace ncnn { + +Packing_loongarch::Packing_loongarch() +{ + support_packing = true; +} + +int Packing_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + if (elembits == 8) + return forward_int8(bottom_blob, top_blob, opt); + + if (use_padding) + { + return Packing::forward(bottom_blob, top_blob, opt); + } + + if (elembits != 32) + { + // non-fp32 type + return Packing::forward(bottom_blob, top_blob, opt); + } + + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + if (elempack == out_elempack) + { + top_blob = bottom_blob; + return 0; + } + + bool pack1to4 = elempack == 1 && out_elempack == 4; + bool pack4to1 = elempack == 4 && out_elempack == 1; + + if (!pack1to4 && !pack4to1) + { + return Packing::forward(bottom_blob, top_blob, opt); + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + + if (!use_padding) + { + // identity if use_padding not allowed + if (dims == 1 && w * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if (dims == 2 && h * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if ((dims == 3 || dims == 4) && channels * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + } + + if (dims == 1) + { + top_blob = bottom_blob; + top_blob.w = w * elempack / out_elempack; + top_blob.cstep = w * elempack / out_elempack; + top_blob.elemsize = elemsize / elempack * out_elempack; + top_blob.elempack = out_elempack; + return 0; + } + + if (dims == 2) + { + int outh = h * elempack / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (pack1to4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* r0 = bottom_blob.row(i * 4); + const float* r1 = bottom_blob.row(i * 4 + 1); + const float* r2 = bottom_blob.row(i * 4 + 2); + const float* r3 = bottom_blob.row(i * 4 + 3); + + float* 
outptr = top_blob.row(i); + + int j = 0; +#if __loongarch_sx + for (; j + 3 < w; j += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r3 = __lsx_vld(r3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr, 0); + __lsx_vst(_r0123_1, outptr + 4, 0); + __lsx_vst(_r0123_2, outptr + 4 * 2, 0); + __lsx_vst(_r0123_3, outptr + 4 * 3, 0); + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + outptr += 16; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + outptr[0] = *r0++; + outptr[1] = *r1++; + outptr[2] = *r2++; + outptr[3] = *r3++; + + outptr += 4; + } + } + } + if (pack4to1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* r0 = bottom_blob.row(i); + + float* outptr0 = top_blob.row(i * 4); + float* outptr1 = top_blob.row(i * 4 + 1); + float* outptr2 = top_blob.row(i * 4 + 2); + float* outptr3 = top_blob.row(i * 4 + 3); + + int j = 0; +#if __loongarch_sx + for (; j + 3 < w; j += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr0, 0); + __lsx_vst(_r0123_1, outptr1, 0); + __lsx_vst(_r0123_2, outptr2, 0); + __lsx_vst(_r0123_3, outptr3, 0); + + r0 += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + } + } + + return 0; + } + + if (dims == 3 || dims == 4) + { + int size = w * h * d; + int outc = channels * elempack / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (dims == 3) + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + else // if (dims == 4) + top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (pack1to4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* r0 = bottom_blob.channel(q * 4); + const float* r1 = bottom_blob.channel(q * 4 + 1); + const float* r2 = bottom_blob.channel(q * 4 + 2); + const float* r3 = bottom_blob.channel(q * 4 + 3); + + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r3 = __lsx_vld(r3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = 
__lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr, 0); + __lsx_vst(_r0123_1, outptr + 4, 0); + __lsx_vst(_r0123_2, outptr + 4 * 2, 0); + __lsx_vst(_r0123_3, outptr + 4 * 3, 0); + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + outptr += 16; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + outptr[0] = *r0++; + outptr[1] = *r1++; + outptr[2] = *r2++; + outptr[3] = *r3++; + + outptr += 4; + } + } + } + if (pack4to1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* r0 = bottom_blob.channel(q); + + float* outptr0 = top_blob.channel(q * 4); + float* outptr1 = top_blob.channel(q * 4 + 1); + float* outptr2 = top_blob.channel(q * 4 + 2); + float* outptr3 = top_blob.channel(q * 4 + 3); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr0, 0); + __lsx_vst(_r0123_1, outptr1, 0); + __lsx_vst(_r0123_2, outptr2, 0); + __lsx_vst(_r0123_3, outptr3, 0); + + r0 += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + } + } + + return 0; + } + + return 0; +} + +int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + if (use_padding) + { + return Packing::forward(bottom_blob, top_blob, opt); + } + + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + if (elempack == out_elempack) + { + top_blob = bottom_blob; + return 0; + } + + bool pack1to8 = elempack == 1 && out_elempack == 8; + bool pack8to1 = elempack == 8 && out_elempack == 1; + + if (!pack1to8 && !pack8to1) + { + return Packing::forward(bottom_blob, top_blob, opt); + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + + if (!use_padding) + { + // identity if use_padding not allowed + if (dims == 1 && w * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if (dims == 2 && h * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if ((dims == 3 || dims == 4) && channels * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + } + + if (dims == 1) + { + top_blob = bottom_blob; + top_blob.w = w * elempack / out_elempack; + top_blob.cstep = w * elempack / out_elempack; + top_blob.elemsize = elemsize / elempack * out_elempack; + top_blob.elempack = out_elempack; + return 0; + } + + if (dims == 2) + { + int outh = h * elempack / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if 
(pack1to8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const signed char* r0 = bottom_blob.row(i * 8); + const signed char* r1 = bottom_blob.row(i * 8 + 1); + const signed char* r2 = bottom_blob.row(i * 8 + 2); + const signed char* r3 = bottom_blob.row(i * 8 + 3); + const signed char* r4 = bottom_blob.row(i * 8 + 4); + const signed char* r5 = bottom_blob.row(i * 8 + 5); + const signed char* r6 = bottom_blob.row(i * 8 + 6); + const signed char* r7 = bottom_blob.row(i * 8 + 7); + + signed char* outptr = top_blob.row(i); + + int j = 0; + for (; j < w; j++) + { + outptr[0] = *r0++; + outptr[1] = *r1++; + outptr[2] = *r2++; + outptr[3] = *r3++; + outptr[4] = *r4++; + outptr[5] = *r5++; + outptr[6] = *r6++; + outptr[7] = *r7++; + + outptr += 8; + } + } + } + if (pack8to1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const signed char* r0 = bottom_blob.row(i); + + signed char* outptr0 = top_blob.row(i * 8); + signed char* outptr1 = top_blob.row(i * 8 + 1); + signed char* outptr2 = top_blob.row(i * 8 + 2); + signed char* outptr3 = top_blob.row(i * 8 + 3); + signed char* outptr4 = top_blob.row(i * 8 + 4); + signed char* outptr5 = top_blob.row(i * 8 + 5); + signed char* outptr6 = top_blob.row(i * 8 + 6); + signed char* outptr7 = top_blob.row(i * 8 + 7); + + int j = 0; + for (; j < w; j++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + *outptr4++ = r0[4]; + *outptr5++ = r0[5]; + *outptr6++ = r0[6]; + *outptr7++ = r0[7]; + + r0 += 8; + } + } + } + + return 0; + } + + if (dims == 3 || dims == 4) + { + int size = w * h * d; + int outc = channels * elempack / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (dims == 3) + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + else // if (dims == 4) + top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (pack1to8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const signed char* r0 = bottom_blob.channel(q * 8); + const signed char* r1 = bottom_blob.channel(q * 8 + 1); + const signed char* r2 = bottom_blob.channel(q * 8 + 2); + const signed char* r3 = bottom_blob.channel(q * 8 + 3); + const signed char* r4 = bottom_blob.channel(q * 8 + 4); + const signed char* r5 = bottom_blob.channel(q * 8 + 5); + const signed char* r6 = bottom_blob.channel(q * 8 + 6); + const signed char* r7 = bottom_blob.channel(q * 8 + 7); + + signed char* outptr = top_blob.channel(q); + + int i = 0; + for (; i < size; i++) + { + outptr[0] = *r0++; + outptr[1] = *r1++; + outptr[2] = *r2++; + outptr[3] = *r3++; + outptr[4] = *r4++; + outptr[5] = *r5++; + outptr[6] = *r6++; + outptr[7] = *r7++; + + outptr += 8; + } + } + } + if (pack8to1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const signed char* r0 = bottom_blob.channel(q); + + signed char* outptr0 = top_blob.channel(q * 8); + signed char* outptr1 = top_blob.channel(q * 8 + 1); + signed char* outptr2 = top_blob.channel(q * 8 + 2); + signed char* outptr3 = top_blob.channel(q * 8 + 3); + signed char* outptr4 = top_blob.channel(q * 8 + 4); + signed char* outptr5 = top_blob.channel(q * 8 + 5); + signed char* outptr6 = top_blob.channel(q * 8 + 6); + signed char* outptr7 = top_blob.channel(q * 8 + 7); + + int i = 0; + for (; i < size; i++) + { + 
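+                    // de-interleave one elempack=8 group: byte k of the packed
+                    // element is written to output channel k (plain scalar copy;
+                    // the int8 repack has no dedicated LSX shuffle path here)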
*outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + *outptr4++ = r0[4]; + *outptr5++ = r0[5]; + *outptr6++ = r0[6]; + *outptr7++ = r0[7]; + + r0 += 8; + } + } + } + + return 0; + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/packing_loongarch.h b/src/layer/loongarch/packing_loongarch.h new file mode 100644 index 000000000000..1db215cfee7a --- /dev/null +++ b/src/layer/loongarch/packing_loongarch.h @@ -0,0 +1,35 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_PACKING_LOONGARCH_H +#define LAYER_PACKING_LOONGARCH_H + +#include "packing.h" + +namespace ncnn { + +class Packing_loongarch : virtual public Packing +{ +public: + Packing_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_PACKING_LOONGARCH_H diff --git a/src/layer/loongarch/padding_loongarch.cpp b/src/layer/loongarch/padding_loongarch.cpp new file mode 100644 index 000000000000..1f345ce60532 --- /dev/null +++ b/src/layer/loongarch/padding_loongarch.cpp @@ -0,0 +1,385 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
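+//
+// Packed-layout padding strategy used below:
+//   - fp32 elempack=4 and int8 elempack=8 borders are filled by the kernels in
+//     padding_pack4.h / padding_pack8_int8.h, but only when the requested
+//     top/left/front offsets keep the packing aligned (e.g. top % 4 == 0) and
+//     the padding type allows it; every other case unpacks to elempack=1, runs
+//     the generic Padding::forward(), and repacks the result.
+//   - the int8 constant pad value replicates the low byte of the (truncated)
+//     fill value into all eight bytes of an int64_t, so a single 64-bit store
+//     fills one whole elempack=8 group; per-channel int8 pad values are still
+//     a TODO in this version.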
+ +#include "padding_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +#if __loongarch_sx +#include "padding_pack4.h" +#include "padding_pack8_int8.h" +#endif // __loongarch_sx + +Padding_loongarch::Padding_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Padding_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + if (top == 0 && bottom == 0 && left == 0 && right == 0 && front == 0 && behind == 0) + { + top_blob = bottom_blob; + return 0; + } + + int elembits = bottom_blob.elembits(); + + if (elembits == 8) + return forward_int8(bottom_blob, top_blob, opt); + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 4) + { + if (dims == 1) + { + int outw = w * elempack + left + right; + + int out_elempack = outw % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (left % 4 == 0 && out_elempack == 4 && type == 0) + { + top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + __m128 pad_value = __lsx_vreplfr2vr_s(value); + padding_constant_pack4_lsx(bottom_blob, top_blob, 0, 0, left / 4, right / 4, pad_value); + + return 0; + } + } + + if (dims == 2) + { + int outw = w + left + right; + int outh = h * elempack + top + bottom; + + int out_elempack = outh % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (top % 4 == 0 && out_elempack == 4 && type == 0) + { + top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + __m128 pad_value = __lsx_vreplfr2vr_s(value); + padding_constant_pack4_lsx(bottom_blob, top_blob, top / 4, bottom / 4, left, right, pad_value); + + return 0; + } + } + + if (dims == 3) + { + int outw = w + left + right; + int outh = h + top + bottom; + int outc = channels * elempack + front + behind; + + int out_elempack = outc % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (front % 4 == 0 && out_elempack == 4 && !(outc != channels * elempack && type != 0)) + { + top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int front_ = front / elempack; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc / out_elempack; q++) + { + Mat borderm = top_blob.channel(q); + + __m128 pad_value = per_channel_pad_data_size ? 
(__m128)__lsx_vld((const float*)per_channel_pad_data + q * 4, 0) : __lsx_vreplfr2vr_s(value); + //Channel padding + if ((q - front_) < 0 || (q - front_) >= channels) + { + borderm.fill(pad_value); + } + else + { + const Mat m = bottom_blob.channel(q - front_); + if (type == 0) + padding_constant_pack4_lsx(m, borderm, top, bottom, left, right, pad_value); + if (type == 1) + padding_replicate_pack4_lsx(m, borderm, top, bottom, left, right); + if (type == 2) + padding_reflect_pack4_lsx(m, borderm, top, bottom, left, right); + } + } + + return 0; + } + } + + if (dims == 4) + { + int outw = w + left + right; + int outh = h + top + bottom; + int outd = d + front + behind; + + if (type == 0) + { + top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __m128 pad_value = per_channel_pad_data_size ? (__m128)__lsx_vld((const float*)per_channel_pad_data + q * 4, 0) : __lsx_vreplfr2vr_s(value); + + for (int z = 0; z < outd; z++) + { + Mat borderm = top_blob.channel(q).depth(z); + + // depth padding + if ((z - front) < 0 || (z - front) >= d) + { + borderm.fill(pad_value); + } + else + { + const Mat m = bottom_blob.channel(q).depth(z - front); + padding_constant_pack4_lsx(m, borderm, top, bottom, left, right, pad_value); + } + } + } + + return 0; + } + } + } +#endif // __loongarch_sx + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + } + + Mat top_blob_unpacked; + int ret = Padding::forward(bottom_blob_unpacked, top_blob_unpacked, opt); + if (ret != 0) + return ret; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = top_blob_unpacked.c % 4 == 0 ? 4 : 1; + } +#endif + + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + + return 0; +} + +int Padding_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 8) + { + if (dims == 1) + { + int outw = w * elempack + left + right; + + int out_elempack = outw % 8 == 0 ? 8 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (left % 8 == 0 && out_elempack == 8 && type == 0) + { + top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int64_t v8 = (int64_t)value; + int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56); + padding_constant_pack8_int8_lsx(bottom_blob, top_blob, 0, 0, left / 8, right / 8, pad_value); + + return 0; + } + } + + if (dims == 2) + { + int outw = w + left + right; + int outh = h * elempack + top + bottom; + + int out_elempack = outh % 8 == 0 ? 
8 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (top % 8 == 0 && out_elempack == 8 && type == 0) + { + top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int64_t v8 = (int64_t)value; + int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56); + padding_constant_pack8_int8_lsx(bottom_blob, top_blob, top / 8, bottom / 8, left, right, pad_value); + + return 0; + } + } + + if (dims == 3) + { + int outw = w + left + right; + int outh = h + top + bottom; + int outc = channels * elempack + front + behind; + + int out_elempack = outc % 8 == 0 ? 8 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (front % 8 == 0 && out_elempack == 8 && !(outc != channels * elempack && type != 0)) + { + top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int front_ = front / elempack; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc / out_elempack; q++) + { + Mat borderm = top_blob.channel(q); + + // TODO perchannel + // int64_t pad_value = per_channel_pad_data_size ? vld1_s8(per_channel_pad_data + q * 8) : vdup_n_s8((signed char)value); + int64_t v8 = (int64_t)value; + int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56); + + //Channel padding + if ((q - front_) < 0 || (q - front_) >= channels) + { + borderm.fill(pad_value); + } + else + { + const Mat m = bottom_blob.channel(q - front_); + if (type == 0) + padding_constant_pack8_int8_lsx(m, borderm, top, bottom, left, right, pad_value); + if (type == 1) + padding_replicate_pack8_int8_lsx(m, borderm, top, bottom, left, right); + if (type == 2) + padding_reflect_pack8_int8_lsx(m, borderm, top, bottom, left, right); + } + } + + return 0; + } + } + + if (dims == 4) + { + int outw = w + left + right; + int outh = h + top + bottom; + int outd = d + front + behind; + + if (type == 0) + { + top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + // TODO perchannel + // int64_t pad_value = per_channel_pad_data_size ? vld1_s8(per_channel_pad_data + q * 8) : vdup_n_s8((signed char)value); + int64_t v8 = (int64_t)value; + int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56); + + for (int z = 0; z < outd; z++) + { + Mat borderm = top_blob.channel(q).depth(z); + + // depth padding + if ((z - front) < 0 || (z - front) >= d) + { + borderm.fill(pad_value); + } + else + { + const Mat m = bottom_blob.channel(q).depth(z - front); + padding_constant_pack8_int8_lsx(m, borderm, top, bottom, left, right, pad_value); + } + } + } + + return 0; + } + } + } +#endif // __loongarch_sx + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + } + + Mat top_blob_unpacked; + int ret = Padding::forward(bottom_blob_unpacked, top_blob_unpacked, opt); + if (ret != 0) + return ret; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = top_blob_unpacked.c % 8 == 0 ? 
8 : 1; + } +#endif + + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/padding_loongarch.h b/src/layer/loongarch/padding_loongarch.h new file mode 100644 index 000000000000..137fbc4459ec --- /dev/null +++ b/src/layer/loongarch/padding_loongarch.h @@ -0,0 +1,35 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_PADDING_LOONGARCH_H +#define LAYER_PADDING_LOONGARCH_H + +#include "padding.h" + +namespace ncnn { + +class Padding_loongarch : virtual public Padding +{ +public: + Padding_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_PADDING_LOONGARCH_H diff --git a/src/layer/loongarch/padding_pack4.h b/src/layer/loongarch/padding_pack4.h new file mode 100644 index 000000000000..d040ce778b58 --- /dev/null +++ b/src/layer/loongarch/padding_pack4.h @@ -0,0 +1,213 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
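+//
+// Border-fill kernels for elempack=4 fp32 data. Every logical pixel is a group
+// of 4 floats, so the pointer arithmetic below advances in steps of 4 and the
+// top/bottom/left/right counts are in whole groups:
+//   - constant: store the splatted pad value for the border, copy rows verbatim
+//     for the interior;
+//   - replicate: re-load the nearest edge pixel for the border;
+//   - reflect: mirror without repeating the edge pixel, e.g. the left border at
+//     offset x reads the pixel (left - x) groups into the row, and the right
+//     border walks backwards from the last-but-one pixel (ptr - 8 - x * 4).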
+ +static void padding_constant_pack4_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right, __m128 v) +{ + const float* ptr = src; + float* outptr = dst; + int top_size = top * dst.w; + int bottom_size = bottom * dst.w; + + // fill top + for (int y = 0; y < top_size; y++) + { + __lsx_vst(v, outptr, 0); + outptr += 4; + } + // fill center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + __lsx_vst(v, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + __builtin_prefetch(ptr + 32); + __lsx_vst(__lsx_vld(ptr, 0), outptr, 0); + ptr += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __lsx_vst(v, outptr, 0); + outptr += 4; + } + } + // fill top + for (int y = 0; y < bottom_size; y++) + { + __lsx_vst(v, outptr, 0); + outptr += 4; + } +} + +static void padding_replicate_pack4_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right) +{ + const float* ptr = src; + float* outptr = dst; + + // fill top + for (int y = 0; y < top; y++) + { + const float* ptr0 = ptr; + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + for (int x = 0; x < left; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + _p = (__m128)__lsx_vld(ptr0, 0); + __lsx_vst(_p, outptr, 0); + ptr0 += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + } + // fill center + for (int y = 0; y < src.h; y++) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + for (int x = 0; x < left; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + _p = (__m128)__lsx_vld(ptr, 0); + __lsx_vst(_p, outptr, 0); + ptr += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + } + // fill bottom + ptr -= src.w * 4; + for (int y = 0; y < bottom; y++) + { + const float* ptr0 = ptr; + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + for (int x = 0; x < left; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + _p = (__m128)__lsx_vld(ptr0, 0); + __lsx_vst(_p, outptr, 0); + ptr0 += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + } +} + +static void padding_reflect_pack4_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right) +{ + const float* ptr = src; + float* outptr = dst; + + // fill top + ptr += top * src.w * 4; + for (int y = 0; y < top; y++) + { + const float* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0 + (left - x) * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __lsx_vst(_p, outptr, 0); + ptr0 += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0 - 8 - x * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + ptr -= src.w * 4; + } + // fill center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr + (left - x) * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __lsx_vst(_p, outptr, 0); + ptr += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr - 8 - x * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + } + // fill bottom + ptr -= 2 * src.w * 4; + for (int y = 0; y < bottom; y++) + { + const float* ptr0 = 
ptr; + for (int x = 0; x < left; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0 + (left - x) * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __lsx_vst(_p, outptr, 0); + ptr0 += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0 - 8 - x * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + ptr -= src.w * 4; + } +} diff --git a/src/layer/loongarch/padding_pack8_int8.h b/src/layer/loongarch/padding_pack8_int8.h new file mode 100644 index 000000000000..4c6586c6ae27 --- /dev/null +++ b/src/layer/loongarch/padding_pack8_int8.h @@ -0,0 +1,171 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void padding_constant_pack8_int8_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int64_t _v) +{ + const int64_t* ptr = src; + int64_t* outptr = dst; + + // fill top + for (int y = 0; y < top; y++) + { + for (int x = 0; x < dst.w; x++) + { + *outptr++ = _v; + } + } + // fill center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + *outptr++ = _v; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = _v; + } + } + // fill bottom + for (int y = 0; y < bottom; y++) + { + for (int x = 0; x < dst.w; x++) + { + *outptr++ = _v; + } + } +} + +static void padding_replicate_pack8_int8_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right) +{ + const int64_t* ptr = src; + int64_t* outptr = dst; + + // fill top + for (int y = 0; y < top; y++) + { + const int64_t* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + *outptr++ = *ptr0; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr0++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr0[-1]; + } + } + // fill center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + *outptr++ = *ptr; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr[-1]; + } + } + // fill bottom + ptr -= src.w; + for (int y = 0; y < bottom; y++) + { + const int64_t* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + *outptr++ = *ptr0; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr0++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr0[-1]; + } + } +} + +static void padding_reflect_pack8_int8_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right) +{ + const int64_t* ptr = src; + int64_t* outptr = dst; + + // fill top + ptr += top * src.w; + for (int y = 0; y < top; y++) + { + const int64_t* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + *outptr++ = ptr0[left - x]; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr0++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr0[-2 - x]; + } + ptr -= src.w; + } + // fill 
center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + *outptr++ = ptr[left - x]; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr[-2 - x]; + } + } + // fill bottom + ptr -= 2 * src.w; + for (int y = 0; y < bottom; y++) + { + const int64_t* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + *outptr++ = ptr0[left - x]; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr0++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr0[-2 - x]; + } + ptr -= src.w; + } +} diff --git a/src/layer/loongarch/pooling_loongarch.cpp b/src/layer/loongarch/pooling_loongarch.cpp new file mode 100644 index 000000000000..9d9889713244 --- /dev/null +++ b/src/layer/loongarch/pooling_loongarch.cpp @@ -0,0 +1,291 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pooling_loongarch.h" + +#include + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Pooling_loongarch::Pooling_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Pooling_loongarch::create_pipeline(const Option& /*opt*/) +{ + if (adaptive_pooling) + { + support_packing = false; + + support_bf16_storage = false; + support_fp16_storage = false; + support_int8_storage = false; + support_tensor_storage = false; + } + return 0; +} + +int Pooling_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + if (adaptive_pooling) + { + return Pooling::forward(bottom_blob, top_blob, opt); + } + + // max value in NxN window + // avg value in NxN window + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h); + + if (elempack == 4) + { + if (global_pooling) + { + top_blob.create(channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int size = w * h; + + if (pooling_type == PoolMethod_MAX) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + + __m128 _max = (__m128)__lsx_vld(ptr, 0); + for (int i = 0; i < size; i++) + { + __m128 _val = (__m128)__lsx_vld(ptr, 0); + _max = __lsx_vfmax_s(_max, _val); + ptr += 4; + } + + float* outptr = top_blob; + __lsx_vst(_max, outptr + q * 4, 0); + } + } + else if (pooling_type == PoolMethod_AVE) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + for (int i = 0; i < 
size; i++) + { + __m128 _val = (__m128)__lsx_vld(ptr, 0); + _sum = __lsx_vfadd_s(_sum, _val); + ptr += 4; + } + + __m128 _avg = __lsx_vfmul_s(_sum, __lsx_vreplfr2vr_s(1.f / size)); + + float* outptr = top_blob; + __lsx_vst(_avg, outptr + q * 4, 0); + } + } + + return 0; + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_w) / stride_w + 1; + int outh = (h - kernel_h) / stride_h + 1; + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w - kernel_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2++; + } + p2 += gap; + } + } + + if (pooling_type == PoolMethod_MAX) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + float* outptr = top_blob.channel(q); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + __m128 _max = (__m128)__lsx_vld(sptr, 0); + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0); + _max = __lsx_vfmax_s(_max, _val); + } + + __lsx_vst(_max, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + else if (pooling_type == PoolMethod_AVE) + { + if (avgpool_count_include_pad == 0) + { + int wtailpad = 0; + int htailpad = 0; + + if (pad_mode == 0) // full padding + { + wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right; + htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + float* outptr = top_blob.channel(q); + + for (int i = 0; i < outh; i++) + { + int sy0 = i * stride_h; + + for (int j = 0; j < outw; j++) + { + int sx0 = j * stride_w; + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + int area = 0; + + for (int ki = 0; ki < kernel_h; ki++) + { + int sy = sy0 + ki; + + if (sy < pad_top) + continue; + + if (sy >= h - pad_bottom - htailpad) + break; + + for (int kj = 0; kj < kernel_w; kj++) + { + int sx = sx0 + kj; + + if (sx < pad_left) + continue; + + if (sx >= w - pad_right - wtailpad) + break; + + __m128 _val = (__m128)__lsx_vld(m.row(sy) + sx * 4, 0); + _sum = __lsx_vfadd_s(_sum, _val); + area += 1; + } + } + + __m128 _avg = __lsx_vfmul_s(_sum, __lsx_vreplfr2vr_s(1.f / area)); + __lsx_vst(_avg, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + else // if (avgpool_count_include_pad == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + float* outptr = top_blob.channel(q); + + const float inv_maxk = 1.f / maxk; + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0); + _sum = __lsx_vfadd_s(_sum, _val); + } + + __m128 _avg = 
__lsx_vfmul_s(_sum, __lsx_vreplfr2vr_s(inv_maxk)); + __lsx_vst(_avg, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + } + + return 0; + } +#endif // __loongarch_sx + + return Pooling::forward(bottom_blob, top_blob, opt); +} + +} // namespace ncnn diff --git a/src/layer/loongarch/pooling_loongarch.h b/src/layer/loongarch/pooling_loongarch.h new file mode 100644 index 000000000000..97e0c9ff2f7e --- /dev/null +++ b/src/layer/loongarch/pooling_loongarch.h @@ -0,0 +1,33 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_POOLING_LOONGARCH_H +#define LAYER_POOLING_LOONGARCH_H + +#include "pooling.h" + +namespace ncnn { + +class Pooling_loongarch : virtual public Pooling +{ +public: + Pooling_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_POOLING_LOONGARCH_H diff --git a/src/layer/loongarch/prelu_loongarch.cpp b/src/layer/loongarch/prelu_loongarch.cpp new file mode 100644 index 000000000000..27cc0bc9d446 --- /dev/null +++ b/src/layer/loongarch/prelu_loongarch.cpp @@ -0,0 +1,193 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
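Ahead of the PReLU kernel below, a minimal scalar sketch of the per-element rule that its LSX loops implement with __lsx_vfcmp_cle_s / __lsx_vbitsel_v (the compare produces a lane mask that selects between v and v * slope). The standalone helper is illustrative only and is not part of the patch:

// Illustrative sketch (not part of the patch): scalar PReLU reference.
// Negative inputs are scaled by slope; non-negative inputs pass through unchanged.
static inline void prelu_scalar_ref(float* ptr, int size, float slope)
{
    for (int i = 0; i < size; i++)
    {
        if (ptr[i] < 0.f)
            ptr[i] *= slope;
    }
}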
+ +#include "prelu_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +PReLU_loongarch::PReLU_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int PReLU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + int w = bottom_top_blob.w * elempack; + +#if __loongarch_sx + int nn_w = w / 4; + int remain_w_start = nn_w * 4; +#else + int remain_w_start = 0; +#endif // __loongarch_sx + + float* ptr = bottom_top_blob; + + if (num_slope > 1) + { + const float* slope = slope_data; + +#if __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < nn_w; i++) + { + float* ptr0 = ptr + i * 4; + + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (__m128)__lsx_vld(slope + i * 4, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr0, 0); + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_w_start; i < w; i++) + { + float v = ptr[i]; + if (v < 0.f) + ptr[i] = v * slope[i]; + } + } + else + { + const float slope = slope_data[0]; + +#if __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < nn_w; i++) + { + float* ptr0 = ptr + i * 4; + + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr0, 0); + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_w_start; i < w; i++) + { + float v = ptr[i]; + if (v < 0.f) + ptr[i] = v * slope; + } + } + } + + if (dims == 2) + { + int w = bottom_top_blob.w * elempack; + int h = bottom_top_blob.h; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + + const float slope = num_slope > 1 ? slope_data[i] : slope_data[0]; + + int j = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (elempack == 4 && num_slope > 1) ? (__m128)__lsx_vld((const float*)slope_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(slope); + + for (; j + 3 < w; j += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + float v = *ptr; + if (v < 0.f) + *ptr = v * slope; + + ptr++; + } + } + } + + if (dims == 3) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h * elempack; + + const float* slope_data_ptr = slope_data; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float slope = num_slope > 1 ? 
slope_data_ptr[q] : slope_data_ptr[0]; + + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (elempack == 4 && num_slope > 1) ? (__m128)__lsx_vld((const float*)slope_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(slope); + + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < 0) + *ptr *= slope; + + ptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/prelu_loongarch.h b/src/layer/loongarch/prelu_loongarch.h new file mode 100644 index 000000000000..97031bb06016 --- /dev/null +++ b/src/layer/loongarch/prelu_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_PRELU_LOONGARCH_H +#define LAYER_PRELU_LOONGARCH_H + +#include "prelu.h" + +namespace ncnn { + +class PReLU_loongarch : virtual public PReLU +{ +public: + PReLU_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_PRELU_LOONGARCH_H diff --git a/src/layer/loongarch/quantize_loongarch.cpp b/src/layer/loongarch/quantize_loongarch.cpp new file mode 100644 index 000000000000..657ff2d06bf5 --- /dev/null +++ b/src/layer/loongarch/quantize_loongarch.cpp @@ -0,0 +1,494 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "quantize_loongarch.h" + +#include + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Quantize_loongarch::Quantize_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Quantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 
8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale); + outptr[1] = float2int8(ptr0[1] * scale); + outptr[2] = float2int8(ptr0[2] * scale); + outptr[3] = float2int8(ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]); + outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]); + outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]); + outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(ptr0 + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); + __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); + _vlow = __lsx_vfmul_s(_vlow, _scale); + _vhigh = __lsx_vfmul_s(_vhigh, _scale); + *((int64_t*)outptr) = float2int8(_vlow, _vhigh); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + __m128 _scale0 = (__m128)__lsx_vld((const float*)scale_data + i * 8, 0); + __m128 _scale1 = (__m128)__lsx_vld((const float*)scale_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(ptr0 + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); + __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); + _vlow = __lsx_vfmul_s(_vlow, _scale0); + _vhigh = __lsx_vfmul_s(_vhigh, _scale1); + *((int64_t*)outptr) = float2int8(_vlow, _vhigh); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = 
float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + const float s0 = scale_data[i * 4]; + const float s1 = scale_data[i * 4 + 1]; + const float s2 = scale_data[i * 4 + 2]; + const float s3 = scale_data[i * 4 + 3]; + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(ptr0 + 32); + __builtin_prefetch(ptr1 + 32); + __m128 _v0 = (__m128)__lsx_vld(ptr0, 0); + __m128 _v1 = (__m128)__lsx_vld(ptr0 + 4, 0); + __m128 _v2 = (__m128)__lsx_vld(ptr1, 0); + __m128 _v3 = (__m128)__lsx_vld(ptr1 + 4, 0); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + _v2 = __lsx_vfmul_s(_v2, _scale); + _v3 = __lsx_vfmul_s(_v3, _scale); + *((int64_t*)outptr) = float2int8(_v0, _v2); + *((int64_t*)(outptr + 8)) = float2int8(_v1, _v3); + + ptr0 += 8; + ptr1 += 8; + outptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(ptr0 + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); + __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); + _vlow = __lsx_vfmul_s(_vlow, _scale); + _vhigh = __lsx_vfmul_s(_vhigh, _scale); + *((int64_t*)outptr) = float2int8(_vlow, _vhigh); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + __m128 _scale0 = (__m128)__lsx_vld((const float*)scale_data + q * 8, 0); + __m128 _scale1 = (__m128)__lsx_vld((const float*)scale_data + q * 8 + 4, 0); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(ptr0 + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); + __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); + _vlow = __lsx_vfmul_s(_vlow, _scale0); + _vhigh = __lsx_vfmul_s(_vhigh, _scale1); + *((int64_t*)outptr) = float2int8(_vlow, _vhigh); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp 
parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + const float s0 = scale_data[q * 4]; + const float s1 = scale_data[q * 4 + 1]; + const float s2 = scale_data[q * 4 + 2]; + const float s3 = scale_data[q * 4 + 3]; + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + return 0; + } +#endif // __loongarch_sx + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const float* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale_data[i]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + for (int j = 0; j < w; j++) + { + *outptr0++ = float2int8(*ptr0++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + + int i = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + for (; i + 15 < size; i += 16) + { + __builtin_prefetch(ptr + 64); + __m128 _v0 = (__m128)__lsx_vld(ptr, 0); + __m128 _v1 = (__m128)__lsx_vld(ptr + 4, 0); + __m128 _v2 = (__m128)__lsx_vld(ptr + 8, 0); + __m128 _v3 = (__m128)__lsx_vld(ptr + 12, 0); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + _v2 = __lsx_vfmul_s(_v2, _scale); + _v3 = __lsx_vfmul_s(_v3, _scale); + *((int64_t*)outptr) = float2int8(_v0, _v1); + *((int64_t*)(outptr + 8)) = float2int8(_v2, _v3); + + ptr += 16; + outptr += 16; + } + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(ptr + 32); + __m128 _v0 = (__m128)__lsx_vld(ptr, 0); + __m128 _v1 = (__m128)__lsx_vld(ptr + 4, 0); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + *((int64_t*)outptr) = float2int8(_v0, _v1); + + ptr += 8; + outptr += 8; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr++ = float2int8(*ptr++ * scale); + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/quantize_loongarch.h b/src/layer/loongarch/quantize_loongarch.h new file mode 100644 index 000000000000..cae04aab171f --- /dev/null +++ b/src/layer/loongarch/quantize_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_QUANTIZE_LOONGARCH_H +#define LAYER_QUANTIZE_LOONGARCH_H + +#include "quantize.h" + +namespace ncnn { + +class Quantize_loongarch : virtual public Quantize +{ +public: + Quantize_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_QUANTIZE_LOONGARCH_H diff --git a/src/layer/loongarch/relu_loongarch.cpp b/src/layer/loongarch/relu_loongarch.cpp new file mode 100644 index 000000000000..eb478d3ae9b1 --- /dev/null +++ b/src/layer/loongarch/relu_loongarch.cpp @@ -0,0 +1,98 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
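As a scalar cross-check for the quantize paths above: a sketch of the rounding and saturation rule assumed here. ncnn's float2int8 helper (presumably provided by loongarch_usability.h) is taken to round to nearest and clamp to [-127, 127]; the standalone version below is illustrative only and not part of the patch:

#include <math.h> // roundf

// Illustrative sketch only: scalar int8 quantization matching the intent of the code above,
// assuming round-to-nearest with saturation to [-127, 127].
static inline signed char quantize_scalar_ref(float v, float scale)
{
    int q = (int)roundf(v * scale);
    if (q > 127) q = 127;
    if (q < -127) q = -127;
    return (signed char)q;
}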
+ +#include "relu_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +ReLU_loongarch::ReLU_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int ReLU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + if (slope == 0.f) + { + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmax_s(_p, _zero); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < 0) + *ptr = 0; + ptr++; + } + } + else + { + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < 0) + *ptr *= slope; + ptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/relu_loongarch.h b/src/layer/loongarch/relu_loongarch.h new file mode 100644 index 000000000000..445c6e8febca --- /dev/null +++ b/src/layer/loongarch/relu_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_RELU_LOONGARCH_H +#define LAYER_RELU_LOONGARCH_H + +#include "relu.h" + +namespace ncnn { + +class ReLU_loongarch : virtual public ReLU +{ +public: + ReLU_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_RELU_LOONGARCH_H diff --git a/src/layer/loongarch/requantize_leakyrelu_pack4.h b/src/layer/loongarch/requantize_leakyrelu_pack4.h new file mode 100644 index 000000000000..d6b499426609 --- /dev/null +++ b/src/layer/loongarch/requantize_leakyrelu_pack4.h @@ -0,0 +1,271 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void requantize_leakyrelu_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = top_blob.c; + int out_elempack = top_blob.elempack; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(leakyrelu(v * scale_in, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out), slope) + + // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr0 + 64); + __builtin_prefetch(intptr1 + 64); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); + __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); + __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); + _v00 = __lsx_vfmul_s(_v00, _scale0); + _v01 = __lsx_vfmul_s(_v01, _scale0); + _v02 = __lsx_vfmul_s(_v02, _scale0); + _v03 = __lsx_vfmul_s(_v03, _scale0); + _v10 = __lsx_vfmul_s(_v10, _scale1); + _v11 = __lsx_vfmul_s(_v11, _scale1); + _v12 = __lsx_vfmul_s(_v12, _scale1); + _v13 = __lsx_vfmul_s(_v13, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); + *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v02, _v12, _slope); + *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v03, _v13, _slope); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr0 + 64); + __builtin_prefetch(intptr1 + 64); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); + __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); + __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); + _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); + _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); + _v02 = __lsx_vfmadd_s(_scale0, _v02, _bias0); + _v03 = __lsx_vfmadd_s(_scale0, _v03, _bias0); + _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); + _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); + _v12 = __lsx_vfmadd_s(_scale1, _v12, _bias1); + _v13 = __lsx_vfmadd_s(_scale1, _v13, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); + *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v02, _v12, _slope); + *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v03, _v13, _slope); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr0 + 32); + __builtin_prefetch(intptr1 + 32); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); + _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); + _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); + _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); + + intptr0 += 8; + intptr1 += 8; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + signed char* vp; + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + + __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __m128i v = float2int8leakyrelu(_v, _slope); + vp = (signed char*)&v; + ptr0[0] = vp[0]; + ptr1[0] = vp[1]; + ptr2[0] = vp[2]; + ptr3[0] = vp[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + signed char* vp; + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + + __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); + _bias = __lsx_vfmul_s(_bias, _scale_out); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __m128i v = float2int8leakyrelu(_v, _slope); + vp = (signed char*)&v; + ptr0[0] = vp[0]; + ptr1[0] = vp[1]; + ptr2[0] = vp[2]; + ptr3[0] = vp[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } +} diff --git a/src/layer/loongarch/requantize_leakyrelu_pack8.h b/src/layer/loongarch/requantize_leakyrelu_pack8.h new file mode 100644 index 000000000000..a2c4faed4f2a --- /dev/null +++ b/src/layer/loongarch/requantize_leakyrelu_pack8.h @@ -0,0 +1,188 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
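The comment pair repeated in these requantize_leakyrelu headers relies on leaky ReLU being positively homogeneous: for scale_out > 0, leakyrelu(y, slope) * scale_out == leakyrelu(y * scale_out, slope), so scale_out can be folded into scale_in and bias before the loop. A scalar sketch of the fused form (names and helper are illustrative, not part of the patch):

#include <math.h> // roundf

// Illustrative sketch: fused requantize + leaky ReLU on one value, assuming scale_out > 0
// and int8 saturation to [-127, 127].
static inline signed char requantize_leakyrelu_ref(int v, float scale_in, float scale_out,
                                                   float bias, float slope)
{
    // int8(leakyrelu(v * scale_in + bias, slope) * scale_out)
    // == int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope)
    float y = (float)v * (scale_in * scale_out) + bias * scale_out;
    float f = y > 0.f ? y : y * slope;
    int q = (int)roundf(f);
    if (q > 127) q = 127;
    if (q < -127) q = -127;
    return (signed char)q;
}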
+ +static void requantize_leakyrelu_pack8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(leakyrelu(v * scale_in, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out), slope) + + // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 128); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); + __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); + __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); + __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + _v4 = __lsx_vfmul_s(_v4, _scale0); + _v5 = __lsx_vfmul_s(_v5, _scale1); + _v6 = __lsx_vfmul_s(_v6, _scale0); + _v7 = __lsx_vfmul_s(_v7, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); + *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v4, _v5, _slope); + *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v6, _v7, _slope); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = 
__lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 128); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); + __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); + __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); + __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + _v4 = __lsx_vfmadd_s(_scale0, _v4, _bias0); + _v5 = __lsx_vfmadd_s(_scale1, _v5, _bias1); + _v6 = __lsx_vfmadd_s(_scale0, _v6, _bias0); + _v7 = __lsx_vfmadd_s(_scale1, _v7, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); + *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v4, _v5, _slope); + *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v6, _v7, _slope); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + 
__builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + + intptr += 8; + ptr += 8; + } + } + } +} diff --git a/src/layer/loongarch/requantize_loongarch.cpp b/src/layer/loongarch/requantize_loongarch.cpp new file mode 100644 index 000000000000..556d20de4f6d --- /dev/null +++ b/src/layer/loongarch/requantize_loongarch.cpp @@ -0,0 +1,1386 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "requantize_loongarch.h" + +#include + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +#if __loongarch_sx +#include "requantize_leakyrelu_pack4.h" +#include "requantize_leakyrelu_pack8.h" +#include "requantize_relu_pack4.h" +#include "requantize_relu_pack8.h" +#endif // __loongarch_sx + +Requantize_loongarch::Requantize_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Requantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 8) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in); + _v1 = __lsx_vfmul_s(_v1, _scale_in); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias); + _v0 = 
activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in); + _v1 = __lsx_vfmul_s(_v1, _scale_in); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (activation_type == 1) + { + requantize_relu_pack8_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + return 0; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + requantize_leakyrelu_pack8_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + return 0; + } + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + 
i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 
_bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr0[0] = v[0]; + ptr1[0] = v[1]; + ptr2[0] = v[2]; + ptr3[0] = v[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr0[0] = v[0]; + ptr1[0] = v[1]; + ptr2[0] = v[2]; + ptr3[0] = v[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (activation_type == 1) + { + requantize_relu_pack4_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + return 0; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + requantize_leakyrelu_pack4_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + return 0; + } + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr0[0] = v[0]; + ptr1[0] = v[1]; + ptr2[0] = v[2]; + ptr3[0] = v[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr0[0] = v[0]; + ptr1[0] = v[1]; + ptr2[0] = v[2]; + ptr3[0] = v[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } + } + + return 0; + } +#endif // __loongarch_sx + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + signed char* ptr = top_blob; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + const float scale_in = scale_in_data[0]; + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + const float scale_in = scale_in_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for 
(int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + const float bias = bias_data_size == 1 ? 
bias_data[0] : bias_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in + bias; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + + for (int i = 0; i < size; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + for (int i = 0; i < size; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/requantize_loongarch.h b/src/layer/loongarch/requantize_loongarch.h new file mode 100644 index 000000000000..8175989959eb --- /dev/null +++ b/src/layer/loongarch/requantize_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_REQUANTIZE_LOONGARCH_H +#define LAYER_REQUANTIZE_LOONGARCH_H + +#include "requantize.h" + +namespace ncnn { + +class Requantize_loongarch : virtual public Requantize +{ +public: + Requantize_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_REQUANTIZE_LOONGARCH_H diff --git a/src/layer/loongarch/requantize_relu_pack4.h b/src/layer/loongarch/requantize_relu_pack4.h new file mode 100644 index 000000000000..2fba8dfc2e48 --- /dev/null +++ b/src/layer/loongarch/requantize_relu_pack4.h @@ -0,0 +1,267 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void requantize_relu_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = top_blob.c; + int out_elempack = top_blob.elempack; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr0 + 64); + __builtin_prefetch(intptr1 + 64); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); + __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); + __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); + _v00 = __lsx_vfmul_s(_v00, _scale0); + _v01 = __lsx_vfmul_s(_v01, _scale0); + _v02 = __lsx_vfmul_s(_v02, _scale0); + _v03 = __lsx_vfmul_s(_v03, _scale0); + _v10 = __lsx_vfmul_s(_v10, _scale1); + _v11 = __lsx_vfmul_s(_v11, _scale1); + _v12 = __lsx_vfmul_s(_v12, _scale1); + _v13 = __lsx_vfmul_s(_v13, _scale1); + *((int64_t*)ptr) = float2int8relu(_v00, _v10); + *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); + *((int64_t*)(ptr + 16)) = float2int8relu(_v02, _v12); + *((int64_t*)(ptr + 24)) = float2int8relu(_v03, _v13); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr0 + 64); + __builtin_prefetch(intptr1 + 64); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); + __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); + __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); + _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); + _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); + _v02 = __lsx_vfmadd_s(_scale0, _v02, _bias0); + _v03 = __lsx_vfmadd_s(_scale0, _v03, _bias0); + _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); + _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); + _v12 = __lsx_vfmadd_s(_scale1, _v12, _bias1); + _v13 = __lsx_vfmadd_s(_scale1, _v13, _bias1); + *((int64_t*)ptr) = float2int8relu(_v00, _v10); + *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); + *((int64_t*)(ptr + 16)) = float2int8relu(_v02, _v12); + *((int64_t*)(ptr + 24)) = float2int8relu(_v03, _v13); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr0 + 32); + __builtin_prefetch(intptr1 + 32); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); + _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); + _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); + _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); + *((int64_t*)ptr) = float2int8relu(_v00, _v10); + *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); + + intptr0 += 8; + intptr1 += 8; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + signed char* vp; + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + + __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __m128i v = float2int8relu(_v); + vp = (signed char*)&v; + ptr0[0] = vp[0]; + ptr1[0] = vp[1]; + ptr2[0] = vp[2]; + ptr3[0] = vp[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + signed char* vp; + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + + __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); + _bias = __lsx_vfmul_s(_bias, _scale_out); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __m128i v = float2int8relu(_v); + vp = (signed char*)&v; + ptr0[0] = vp[0]; + ptr1[0] = vp[1]; + ptr2[0] = vp[2]; + ptr3[0] = vp[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } +} diff --git a/src/layer/loongarch/requantize_relu_pack8.h b/src/layer/loongarch/requantize_relu_pack8.h new file mode 100644 index 000000000000..3d2a45b45d06 --- /dev/null +++ b/src/layer/loongarch/requantize_relu_pack8.h @@ -0,0 +1,186 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
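+
+// requantize with fused ReLU for pack8 (int32 -> int8) blobs: the per-channel
+// scale_in and scale_out vectors (or scalar broadcasts) are folded into a single
+// multiplier, and any bias is pre-multiplied by scale_out, so the unrolled inner
+// loops only need a vfmul / vfmadd before the saturating float2int8relu pack,
+// which clamps negative results to zero.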
+ +static void requantize_relu_pack8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 128); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); + __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); + __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); + __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + _v4 = __lsx_vfmul_s(_v4, _scale0); + _v5 = __lsx_vfmul_s(_v5, _scale1); + _v6 = __lsx_vfmul_s(_v6, _scale0); + _v7 = __lsx_vfmul_s(_v7, _scale1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); + *((int64_t*)(ptr + 16)) = float2int8relu(_v4, _v5); + *((int64_t*)(ptr + 24)) = float2int8relu(_v6, _v7); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = 
float2int8relu(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 128); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); + __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); + __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); + __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + _v4 = __lsx_vfmadd_s(_scale0, _v4, _bias0); + _v5 = __lsx_vfmadd_s(_scale1, _v5, _bias1); + _v6 = __lsx_vfmadd_s(_scale0, _v6, _bias0); + _v7 = __lsx_vfmadd_s(_scale1, _v7, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); + *((int64_t*)(ptr + 16)) = float2int8relu(_v4, _v5); + *((int64_t*)(ptr + 24)) = float2int8relu(_v6, _v7); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } +} diff --git 
a/src/layer/loongarch/sigmoid_loongarch.cpp b/src/layer/loongarch/sigmoid_loongarch.cpp new file mode 100644 index 000000000000..6d112804f269 --- /dev/null +++ b/src/layer/loongarch/sigmoid_loongarch.cpp @@ -0,0 +1,76 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "sigmoid_loongarch.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+#include "loongarch_usability.h"
+
+#include <math.h>
+
+namespace ncnn {
+
+Sigmoid_loongarch::Sigmoid_loongarch()
+{
+#if __loongarch_sx
+ support_packing = true;
+#endif
+}
+
+int Sigmoid_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+ int w = bottom_top_blob.w;
+ int h = bottom_top_blob.h;
+ int d = bottom_top_blob.d;
+ int channels = bottom_top_blob.c;
+ int elempack = bottom_top_blob.elempack;
+ int size = w * h * d * elempack;
+
+ #pragma omp parallel for num_threads(opt.num_threads)
+ for (int q = 0; q < channels; q++)
+ {
+ float* ptr = bottom_top_blob.channel(q);
+
+ int i = 0;
+#if __loongarch_sx
+ __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
+ for (; i + 3 < size; i += 4)
+ {
+ __builtin_prefetch(ptr + 16);
+ __m128 _p = (__m128)__lsx_vld(ptr, 0);
+ _p = (__m128)__lsx_vbitrevi_w((__m128i)_p, 31);
+ _p = exp_ps(_p);
+ _p = __lsx_vfadd_s(_p, _one);
+ __m128 _outp = __lsx_vfdiv_s(_one, _p);
+ __lsx_vst(_outp, ptr, 0);
+
+ ptr += 4;
+ }
+#endif // __loongarch_sx
+ for (; i < size; i++)
+ {
+ *ptr = 1.f / (1.f + exp(-*ptr));
+
+ ptr++;
+ }
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/sigmoid_loongarch.h b/src/layer/loongarch/sigmoid_loongarch.h new file mode 100644 index 000000000000..b15aad235db5 --- /dev/null +++ b/src/layer/loongarch/sigmoid_loongarch.h @@ -0,0 +1,32 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SIGMOID_LOONGARCH_H
+#define LAYER_SIGMOID_LOONGARCH_H
+
+#include "sigmoid.h"
+
+namespace ncnn {
+
+class Sigmoid_loongarch : virtual public Sigmoid
+{
+public:
+ Sigmoid_loongarch();
+
+ virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SIGMOID_LOONGARCH_H
diff --git a/src/layer/loongarch/slice_loongarch.cpp b/src/layer/loongarch/slice_loongarch.cpp new file mode 100644 index 000000000000..edd8656a4bb3 --- /dev/null +++ b/src/layer/loongarch/slice_loongarch.cpp @@ -0,0 +1,371 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "slice_loongarch.h"
+
+namespace ncnn {
+
+Slice_loongarch::Slice_loongarch()
+{
+#if __loongarch_sx
+ support_packing = true;
+#endif // __loongarch_sx
+}
+
+int Slice_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+ const Mat& bottom_blob = bottom_blobs[0];
+ int dims = bottom_blob.dims;
+ size_t elemsize = bottom_blob.elemsize;
+ int elempack = bottom_blob.elempack;
+ const int* slices_ptr = slices;
+ int positive_axis = axis < 0 ? dims + axis : axis;
+
+ if (dims == 1) // positive_axis == 0
+ {
+ // slice vector
+ int w = bottom_blob.w * elempack;
+ int q = 0;
+ for (size_t i = 0; i < top_blobs.size(); i++)
+ {
+ int slice = slices_ptr[i];
+ if (slice == -233)
+ {
+ slice = (w - q) / (top_blobs.size() - i);
+ }
+
+ int out_elempack = 1;
+#if __loongarch_sx
+ if (opt.use_packing_layout)
+ out_elempack = slice % 4 == 0 ? 4 : 1;
+#endif
+ size_t out_elemsize = elemsize / elempack * out_elempack;
+
+ Mat& top_blob = top_blobs[i];
+ top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
+ if (top_blob.empty())
+ return -100;
+
+ const float* ptr = (const float*)bottom_blob + q;
+ float* outptr = top_blob;
+ memcpy(outptr, ptr, top_blob.w * top_blob.elemsize);
+
+ q += slice;
+ }
+ }
+
+ if (dims == 2 && positive_axis == 0)
+ {
+ // slice image height
+ int w = bottom_blob.w;
+ int h = bottom_blob.h * elempack;
+
+ int q = 0;
+ for (size_t i = 0; i < top_blobs.size(); i++)
+ {
+ int slice = slices_ptr[i];
+ if (slice == -233)
+ {
+ slice = (h - q) / (top_blobs.size() - i);
+ }
+
+ int out_elempack = 1;
+#if __loongarch_sx
+ if (opt.use_packing_layout)
+ out_elempack = slice % 4 == 0 ?
4 : 1; +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[i]; + top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + size_t out_elemsize = top_blobs[0].elemsize; + int out_elempack = top_blobs[0].elempack; + for (size_t i = 0; i < top_blobs.size(); i++) + { + out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize); + out_elempack = std::min(out_elempack, top_blobs[i].elempack); + } + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > out_elempack) + { + convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); + } + + const float* ptr = bottom_blob_unpacked; + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + if (out_elempack == 1 && top_blob.elempack == 4) + { + for (int j = 0; j < top_blob.h; j++) + { + const float* r0 = ptr; + const float* r1 = ptr + w; + const float* r2 = ptr + w * 2; + const float* r3 = ptr + w * 3; + + float* outptr0 = top_blob.row(j); + + for (int j = 0; j < w; j++) + { + outptr0[0] = *r0++; + outptr0[1] = *r1++; + outptr0[2] = *r2++; + outptr0[3] = *r3++; + + outptr0 += 4; + } + + ptr += w * 4; + } + } + else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4) + { + int size = w * top_blob.h; + + float* outptr = top_blob; + memcpy(outptr, ptr, size * top_blob.elemsize); + + ptr += size * top_blob.elempack; + } + } + } + + if (dims == 2 && positive_axis == 1) + { + // slice image width + int w = bottom_blob.w; + int h = bottom_blob.h; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (w - q) / (top_blobs.size() - i); + } + + Mat& top_blob = top_blobs[i]; + top_blob.create(slice, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { + const float* ptr = bottom_blob.row(j); + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + float* outptr = top_blob.row(j); + memcpy(outptr, ptr, top_blob.w * elemsize); + + ptr += top_blob.w * elempack; + } + } + } + + if (dims == 3 && positive_axis == 0) + { + // slice dim channel + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c * elempack; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (channels - q) / (top_blobs.size() - i); + } + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + out_elempack = slice % 4 == 0 ? 
4 : 1; +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[i]; + top_blob.create(w, h, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + size_t out_elemsize = top_blobs[0].elemsize; + int out_elempack = top_blobs[0].elempack; + for (size_t i = 0; i < top_blobs.size(); i++) + { + out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize); + out_elempack = std::min(out_elempack, top_blobs[i].elempack); + } + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > out_elempack) + { + convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); + } + + int p = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + if (out_elempack == 1 && top_blob.elempack == 4) + { + int size = top_blob.w * top_blob.h; + + for (int q = 0; q < top_blob.c; q++) + { + const float* r0 = bottom_blob_unpacked.channel(p); + const float* r1 = bottom_blob_unpacked.channel(p + 1); + const float* r2 = bottom_blob_unpacked.channel(p + 2); + const float* r3 = bottom_blob_unpacked.channel(p + 3); + + float* outptr0 = top_blob.channel(q); + + for (int j = 0; j < size; j++) + { + outptr0[0] = *r0++; + outptr0[1] = *r1++; + outptr0[2] = *r2++; + outptr0[3] = *r3++; + + outptr0 += 4; + } + + p += 4; + } + } + else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4) + { + int size = top_blob.total(); + + const float* ptr = bottom_blob_unpacked.channel(p); + float* outptr = top_blob; + memcpy(outptr, ptr, size * top_blob.elemsize); + + p += top_blob.c; + } + } + } + + if (dims == 3 && positive_axis == 1) + { + // slice dim height + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (h - q) / (top_blobs.size() - i); + } + + Mat& top_blob = top_blobs[i]; + top_blob.create(w, slice, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* ptr = bottom_blob.channel(p); + + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + int size = top_blob.w * top_blob.h; + + float* outptr = top_blob.channel(p); + memcpy(outptr, ptr, size * elemsize); + + ptr += size * elempack; + } + } + } + + if (dims == 3 && positive_axis == 2) + { + // slice dim width + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (w - q) / (top_blobs.size() - i); + } + + Mat& top_blob = top_blobs[i]; + top_blob.create(slice, h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* ptr = bottom_blob.channel(p); + + for (int j = 0; j < h; j++) + { + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + float* outptr = top_blob.channel(p).row(j); + memcpy(outptr, ptr, top_blob.w * elemsize); + + ptr += top_blob.w * elempack; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/slice_loongarch.h 
b/src/layer/loongarch/slice_loongarch.h new file mode 100644 index 000000000000..b42138ba4183 --- /dev/null +++ b/src/layer/loongarch/slice_loongarch.h @@ -0,0 +1,32 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SLICE_LOONGARCH_H
+#define LAYER_SLICE_LOONGARCH_H
+
+#include "slice.h"
+
+namespace ncnn {
+
+class Slice_loongarch : virtual public Slice
+{
+public:
+ Slice_loongarch();
+
+ virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SLICE_LOONGARCH_H
diff --git a/src/layer/loongarch/softmax_loongarch.cpp b/src/layer/loongarch/softmax_loongarch.cpp new file mode 100644 index 000000000000..88b49559754b --- /dev/null +++ b/src/layer/loongarch/softmax_loongarch.cpp @@ -0,0 +1,175 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "softmax_loongarch.h"
+
+#include <float.h>
+#include <math.h>
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+namespace ncnn {
+
+int Softmax_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+ int dims = bottom_top_blob.dims;
+ size_t elemsize = bottom_top_blob.elemsize;
+ int positive_axis = axis < 0 ?
dims + axis : axis; + + if (dims != 3 || positive_axis != 0) + return Softmax::forward_inplace(bottom_top_blob, opt); + + // value = exp( value - global max value ) + // sum all value + // value = value / sum + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + + Mat max; + max.create(w, h, elemsize, opt.workspace_allocator); + if (max.empty()) + return -100; + max.fill(-FLT_MAX); + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float* maxptr = max; + + for (int i = 0; i < size; i++) + { + maxptr[i] = std::max(maxptr[i], ptr[i]); + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float* maxptr = max; + +#if __loongarch_sx + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __loongarch_sx + +#if __loongarch_sx + for (; nn > 0; nn--) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _max = (__m128)__lsx_vld(maxptr, 0); + + _p = exp_ps(__lsx_vfsub_s(_p, _max)); + + __lsx_vst(_p, ptr, 0); + + ptr += 4; + maxptr += 4; + } +#endif // __loongarch_sx + + for (; remain > 0; remain--) + { + *ptr = exp(*ptr - *maxptr); + + ptr++; + maxptr++; + } + } + + Mat sum; + sum.create(w, h, elemsize, opt.workspace_allocator); + if (sum.empty()) + return -100; + sum.fill(0.f); + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float* sumptr = sum; + +#if __loongarch_sx + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __loongarch_sx + +#if __loongarch_sx + for (; nn > 0; nn--) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _sum = (__m128)__lsx_vld(sumptr, 0); + _sum = __lsx_vfadd_s(_sum, _p); + __lsx_vst(_sum, sumptr, 0); + + ptr += 4; + sumptr += 4; + } +#endif // __loongarch_sx + + for (; remain > 0; remain--) + { + *sumptr += *ptr; + + ptr++; + sumptr++; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float* sumptr = sum; + +#if __loongarch_sx + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __loongarch_sx + +#if __loongarch_sx + for (; nn > 0; nn--) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _sum = (__m128)__lsx_vld(sumptr, 0); + _p = __lsx_vfdiv_s(_p, _sum); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + sumptr += 4; + } +#endif // __loongarch_sx + + for (; remain > 0; remain--) + { + *ptr /= *sumptr; + + ptr++; + sumptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/softmax_loongarch.h b/src/layer/loongarch/softmax_loongarch.h new file mode 100644 index 000000000000..3c8272a6412f --- /dev/null +++ b/src/layer/loongarch/softmax_loongarch.h @@ -0,0 +1,30 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SOFTMAX_LOONGARCH_H
+#define LAYER_SOFTMAX_LOONGARCH_H
+
+#include "softmax.h"
+
+namespace ncnn {
+
+class Softmax_loongarch : virtual public Softmax
+{
+public:
+ virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SOFTMAX_LOONGARCH_H
diff --git a/src/layer/loongarch/swish_loongarch.cpp b/src/layer/loongarch/swish_loongarch.cpp new file mode 100644 index 000000000000..9c9005de6fcc --- /dev/null +++ b/src/layer/loongarch/swish_loongarch.cpp @@ -0,0 +1,70 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "swish_loongarch.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+#include <math.h>
+
+namespace ncnn {
+
+Swish_loongarch::Swish_loongarch()
+{
+#if __loongarch_sx
+ support_packing = true;
+#endif // __loongarch_sx
+}
+
+int Swish_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+ int w = bottom_top_blob.w;
+ int h = bottom_top_blob.h;
+ int d = bottom_top_blob.d;
+ int channels = bottom_top_blob.c;
+ int elempack = bottom_top_blob.elempack;
+ int size = w * h * d * elempack;
+
+ #pragma omp parallel for num_threads(opt.num_threads)
+ for (int q = 0; q < channels; q++)
+ {
+ float* ptr = bottom_top_blob.channel(q);
+
+ int i = 0;
+#if __loongarch_sx
+ __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
+ for (; i + 3 < size; i += 4)
+ {
+ __builtin_prefetch(ptr + 16);
+ __m128i _p = __lsx_vld(ptr, 0);
+ _p = (__m128i)__lsx_vfdiv_s((__m128)_p, __lsx_vfadd_s(_one, exp_ps((__m128)__lsx_vbitrevi_w(_p, 31))));
+ __lsx_vst(_p, ptr, 0);
+
+ ptr += 4;
+ }
+#endif // __loongarch_sx
+ for (; i < size; i++)
+ {
+ *ptr = *ptr / (1.f + exp(-*ptr));
+ ptr++;
+ }
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/swish_loongarch.h b/src/layer/loongarch/swish_loongarch.h new file mode 100644 index 000000000000..b8d0b80f01e4 --- /dev/null +++ b/src/layer/loongarch/swish_loongarch.h @@ -0,0 +1,32 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SWISH_LOONGARCH_H
+#define LAYER_SWISH_LOONGARCH_H
+
+#include "swish.h"
+
+namespace ncnn {
+
+class Swish_loongarch : virtual public Swish
+{
+public:
+ Swish_loongarch();
+
+ virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SWISH_LOONGARCH_H
diff --git a/src/layer/loongarch/tanh_loongarch.cpp b/src/layer/loongarch/tanh_loongarch.cpp new file mode 100644 index 000000000000..13227fa71e34 --- /dev/null +++ b/src/layer/loongarch/tanh_loongarch.cpp @@ -0,0 +1,69 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "tanh_loongarch.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+#include <math.h>
+
+namespace ncnn {
+
+TanH_loongarch::TanH_loongarch()
+{
+#if __loongarch_sx
+ support_packing = true;
+#endif
+}
+
+int TanH_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+ int w = bottom_top_blob.w;
+ int h = bottom_top_blob.h;
+ int d = bottom_top_blob.d;
+ int channels = bottom_top_blob.c;
+ int elempack = bottom_top_blob.elempack;
+ int size = w * h * d * elempack;
+
+ #pragma omp parallel for num_threads(opt.num_threads)
+ for (int q = 0; q < channels; q++)
+ {
+ float* ptr = bottom_top_blob.channel(q);
+
+ int i = 0;
+#if __loongarch_sx
+ for (; i + 3 < size; i += 4)
+ {
+ __builtin_prefetch(ptr + 16);
+ __m128 _p = (__m128)__lsx_vld(ptr, 0);
+ _p = tanh_ps(_p);
+ __lsx_vst(_p, ptr, 0);
+
+ ptr += 4;
+ }
+#endif // __loongarch_sx
+ for (; i < size; i++)
+ {
+ *ptr = tanh(*ptr);
+ ptr++;
+ }
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/tanh_loongarch.h b/src/layer/loongarch/tanh_loongarch.h new file mode 100644 index 000000000000..ecbab01ec8fe --- /dev/null +++ b/src/layer/loongarch/tanh_loongarch.h @@ -0,0 +1,32 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_TANH_LOONGARCH_H
+#define LAYER_TANH_LOONGARCH_H
+
+#include "tanh.h"
+
+namespace ncnn {
+
+class TanH_loongarch : virtual public TanH
+{
+public:
+ TanH_loongarch();
+
+ virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_TANH_LOONGARCH_H
diff --git a/src/layer/loongarch/unaryop_loongarch.cpp b/src/layer/loongarch/unaryop_loongarch.cpp new file mode 100644 index 000000000000..892c4dc42608 --- /dev/null +++ b/src/layer/loongarch/unaryop_loongarch.cpp @@ -0,0 +1,427 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "unaryop_loongarch.h"
+
+#include <math.h>
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+namespace ncnn {
+
+UnaryOp_loongarch::UnaryOp_loongarch()
+{
+#if __loongarch_sx
+ support_packing = true;
+#endif // __loongarch_sx
+}
+
+template<typename Op>
+static int unary_op_inplace(Mat& a, const Option& opt)
+{
+ Op op;
+
+ int w = a.w;
+ int h = a.h;
+ int d = a.d;
+ int channels = a.c;
+ int elempack = a.elempack;
+ int size = w * h * d * elempack;
+
+ #pragma omp parallel for num_threads(opt.num_threads)
+ for (int q = 0; q < channels; q++)
+ {
+ float* ptr = a.channel(q);
+
+ int i = 0;
+#if __loongarch_sx
+ for (; i + 3 < size; i += 4)
+ {
+ __builtin_prefetch(ptr + 16);
+ __m128 _p = (__m128)__lsx_vld(ptr, 0);
+ _p = op.func_pack4(_p);
+ __lsx_vst(_p, ptr, 0);
+ ptr += 4;
+ }
+#endif // __loongarch_sx
+ for (; i < size; i++)
+ {
+ *ptr = op.func(*ptr);
+ ptr++;
+ }
+ }
+
+ return 0;
+}
+
+namespace UnaryOp_loongarch_functor {
+
+struct unary_op_abs
+{
+ float func(const float& x) const
+ {
+ return (float)fabs(x);
+ }
+#if __loongarch_sx
+ __m128 func_pack4(const __m128& x) const
+ {
+ return (__m128)__lsx_vbitclri_w((__m128i)x, 31);
+ }
+#endif // __loongarch_sx
+};
+
+struct unary_op_neg
+{
+ float func(const float& x) const
+ {
+ return -x;
+ }
+#if __loongarch_sx
+ __m128 func_pack4(const __m128& x) const
+ {
+ return (__m128)__lsx_vbitrevi_w((__m128i)x, 31);
+ }
+#endif // __loongarch_sx
+};
+
+struct unary_op_floor
+{
+ float func(const float& x) const
+ {
+ return (float)floor(x);
+ }
+#if __loongarch_sx
+ __m128 func_pack4(const __m128& x) const
+ {
+ // TODO msa optimize
+ float tmp[4];
+ __lsx_vst(x, tmp, 0);
+ tmp[0] = floor(tmp[0]);
+ tmp[1] = floor(tmp[1]);
+ tmp[2] = floor(tmp[2]);
+ tmp[3] = floor(tmp[3]);
+ return (__m128)__lsx_vld(tmp, 0);
+ }
+#endif // __loongarch_sx
+};
+
+struct unary_op_ceil
+{
+ float func(const float& x) const
+ {
+ return (float)ceil(x);
+ }
+#if __loongarch_sx
+ __m128 func_pack4(const __m128& x) const
+ {
+ // TODO msa optimize
+ float tmp[4];
+ __lsx_vst(x, tmp, 0);
+ tmp[0] = ceil(tmp[0]);
+ tmp[1] = ceil(tmp[1]);
+ tmp[2] = ceil(tmp[2]);
+ tmp[3] = ceil(tmp[3]);
+ return (__m128)__lsx_vld(tmp, 0);
+ }
+#endif // __loongarch_sx
+};
+
+struct unary_op_square
+{
+ float func(const
float& x) const + { + return x * x; + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return __lsx_vfmul_s(x, x); + } +#endif // __loongarch_sx +}; + +struct unary_op_sqrt +{ + float func(const float& x) const + { + return (float)sqrt(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return __lsx_vfsqrt_s(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_rsqrt +{ + float func(const float& x) const + { + return (float)(1.f / sqrt(x)); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return __lsx_vfrsqrt_s(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_exp +{ + float func(const float& x) const + { + return (float)exp(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return exp_ps(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_log +{ + float func(const float& x) const + { + return (float)log(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return log_ps(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_sin +{ + float func(const float& x) const + { + return (float)sin(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = sin(tmp[0]); + tmp[1] = sin(tmp[1]); + tmp[2] = sin(tmp[2]); + tmp[3] = sin(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_cos +{ + float func(const float& x) const + { + return (float)cos(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = cos(tmp[0]); + tmp[1] = cos(tmp[1]); + tmp[2] = cos(tmp[2]); + tmp[3] = cos(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_tan +{ + float func(const float& x) const + { + return (float)tan(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = tan(tmp[0]); + tmp[1] = tan(tmp[1]); + tmp[2] = tan(tmp[2]); + tmp[3] = tan(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_asin +{ + float func(const float& x) const + { + return (float)asin(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = asin(tmp[0]); + tmp[1] = asin(tmp[1]); + tmp[2] = asin(tmp[2]); + tmp[3] = asin(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_acos +{ + float func(const float& x) const + { + return (float)acos(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = acos(tmp[0]); + tmp[1] = acos(tmp[1]); + tmp[2] = acos(tmp[2]); + tmp[3] = acos(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_atan +{ + float func(const float& x) const + { + return (float)atan(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = atan(tmp[0]); + tmp[1] = atan(tmp[1]); + tmp[2] = atan(tmp[2]); + tmp[3] = atan(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_reciprocal +{ + float func(const float& x) const + { + return 1.f / x; + } +#if __loongarch_sx + __m128 func_pack4(const 
__m128& x) const
+ {
+ return __lsx_vfrecip_s(x);
+ }
+#endif // __loongarch_sx
+};
+
+struct unary_op_tanh
+{
+ float func(const float& x) const
+ {
+ return (float)tanh(x);
+ }
+#if __loongarch_sx
+ __m128 func_pack4(const __m128& x) const
+ {
+ return tanh_ps(x);
+ }
+#endif // __loongarch_sx
+};
+
+} // namespace UnaryOp_loongarch_functor
+
+int UnaryOp_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+ using namespace UnaryOp_loongarch_functor;
+
+ if (op_type == Operation_ABS)
+ return unary_op_inplace<unary_op_abs>(bottom_top_blob, opt);
+
+ if (op_type == Operation_NEG)
+ return unary_op_inplace<unary_op_neg>(bottom_top_blob, opt);
+
+ if (op_type == Operation_FLOOR)
+ return unary_op_inplace<unary_op_floor>(bottom_top_blob, opt);
+
+ if (op_type == Operation_CEIL)
+ return unary_op_inplace<unary_op_ceil>(bottom_top_blob, opt);
+
+ if (op_type == Operation_SQUARE)
+ return unary_op_inplace<unary_op_square>(bottom_top_blob, opt);
+
+ if (op_type == Operation_SQRT)
+ return unary_op_inplace<unary_op_sqrt>(bottom_top_blob, opt);
+
+ if (op_type == Operation_RSQRT)
+ return unary_op_inplace<unary_op_rsqrt>(bottom_top_blob, opt);
+
+ if (op_type == Operation_EXP)
+ return unary_op_inplace<unary_op_exp>(bottom_top_blob, opt);
+
+ if (op_type == Operation_LOG)
+ return unary_op_inplace<unary_op_log>(bottom_top_blob, opt);
+
+ if (op_type == Operation_SIN)
+ return unary_op_inplace<unary_op_sin>(bottom_top_blob, opt);
+
+ if (op_type == Operation_COS)
+ return unary_op_inplace<unary_op_cos>(bottom_top_blob, opt);
+
+ if (op_type == Operation_TAN)
+ return unary_op_inplace<unary_op_tan>(bottom_top_blob, opt);
+
+ if (op_type == Operation_ASIN)
+ return unary_op_inplace<unary_op_asin>(bottom_top_blob, opt);
+
+ if (op_type == Operation_ACOS)
+ return unary_op_inplace<unary_op_acos>(bottom_top_blob, opt);
+
+ if (op_type == Operation_ATAN)
+ return unary_op_inplace<unary_op_atan>(bottom_top_blob, opt);
+
+ if (op_type == Operation_RECIPROCAL)
+ return unary_op_inplace<unary_op_reciprocal>(bottom_top_blob, opt);
+
+ if (op_type == Operation_TANH)
+ return unary_op_inplace<unary_op_tanh>(bottom_top_blob, opt);
+
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/unaryop_loongarch.h b/src/layer/loongarch/unaryop_loongarch.h new file mode 100644 index 000000000000..8170bec50cf8 --- /dev/null +++ b/src/layer/loongarch/unaryop_loongarch.h @@ -0,0 +1,32 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_UNARYOP_LOONGARCH_H
+#define LAYER_UNARYOP_LOONGARCH_H
+
+#include "unaryop.h"
+
+namespace ncnn {
+
+class UnaryOp_loongarch : virtual public UnaryOp
+{
+public:
+ UnaryOp_loongarch();
+
+ virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_UNARYOP_LOONGARCH_H
diff --git a/src/layer_registry.h.in b/src/layer_registry.h.in index 99c1d8336f41..6947ecce5d18 100644 --- a/src/layer_registry.h.in +++ b/src/layer_registry.h.in @@ -28,6 +28,12 @@ static const layer_registry_entry layer_registry_msa[] = { }; #endif // NCNN_RUNTIME_CPU && NCNN_MSA
+#if NCNN_RUNTIME_CPU && NCNN_LSX
+static const layer_registry_entry layer_registry_lsx[] = {
+@layer_registry_lsx@
+};
+#endif // NCNN_RUNTIME_CPU && NCNN_LSX
+
 #if NCNN_RUNTIME_CPU && NCNN_RVV static const layer_registry_entry layer_registry_rvv[] = { @layer_registry_rvv@
diff --git a/src/mat.h b/src/mat.h index e534def504fc..c6f59ef42684 100644 --- a/src/mat.h +++ b/src/mat.h @@ -29,6 +29,9 @@ #if __mips_msa #include <msa.h> #endif
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif
 #if __riscv_vector #include <riscv_vector.h> #include "cpu.h" // cpu_riscv_vlenb() @@ -128,6 +131,9 @@ class NCNN_EXPORT Mat #if __mips_msa void fill(v4f32 _v); #endif // __mips_msa
+#if __loongarch_sx
+ void fill(__m128 _v);
+#endif //__loongarch_sx
 #if __riscv_vector void fill(vfloat32m1_t _v); void fill(vuint16m1_t _v); @@ -1067,6 +1073,18 @@ NCNN_FORCEINLINE void Mat::fill(v4f32 _v) } #endif // __mips_msa
+#if __loongarch_sx
+NCNN_FORCEINLINE void Mat::fill(__m128 _v)
+{
+ int size = (int)total();
+ float* ptr = (float*)data;
+ for (int i = 0; i < size; i++)
+ {
+ __lsx_vst(_v, ptr, 0);
+ ptr += 4;
+ }
+}
+#endif // __loongarch_sx
 #if __riscv_vector NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v) {
diff --git a/src/platform.h.in b/src/platform.h.in index 755f8294bc29..219cff4aada9 100644 --- a/src/platform.h.in +++ b/src/platform.h.in @@ -55,6 +55,7 @@ #cmakedefine01 NCNN_ARM86SVEF32MM #endif // __aarch64__ #cmakedefine01 NCNN_MSA
+#cmakedefine01 NCNN_LSX
 #cmakedefine01 NCNN_MMI #cmakedefine01 NCNN_RVV #cmakedefine01 NCNN_INT8
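
The requantize kernel at the top of this hunk folds the per-channel math (x * scale_in + bias) * scale_out into one multiply-add per lane by precomputing _scale = scale_in * scale_out and _bias = bias * scale_out. A scalar sketch of that algebra, for illustration only; the rounding/saturation helper below is a stand-in for the patch's float2int8relu, not its actual implementation:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Scalar reference of the folded requantize step:
// (x * scale_in + bias) * scale_out == x * (scale_in * scale_out) + bias * scale_out
static int8_t requantize_relu_ref(int x, float scale_in, float bias, float scale_out)
{
    const float scale = scale_in * scale_out; // mirrors _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0)
    const float b = bias * scale_out;         // mirrors _bias0 = __lsx_vfmul_s(_bias0, _scale_out0)
    float v = (float)x * scale + b;           // mirrors __lsx_vfmadd_s(_scale0, _v0, _bias0)
    v = std::max(v, 0.f);                     // the "relu" part of the int8 pack
    int q = (int)std::nearbyint(v);           // round to nearest (stand-in for float2int8relu)
    return (int8_t)std::min(q, 127);
}

int main()
{
    // (1000 * 0.01 + 0.5) * 12 == 1000 * 0.12 + 6 == 126
    printf("%d\n", (int)requantize_relu_ref(1000, 0.01f, 0.5f, 12.0f));
    return 0;
}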
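
The activation kernels above (Sigmoid, Swish, and the UnaryOp abs/neg functors) avoid float arithmetic for negation and absolute value by manipulating the IEEE-754 sign bit with __lsx_vbitrevi_w / __lsx_vbitclri_w on lane 31. A minimal portable sketch of the same idea in plain C++, purely illustrative and not part of the patch:

#include <cstdint>
#include <cstring>
#include <cstdio>

// Toggle bit 31: the scalar analogue of __lsx_vbitrevi_w(x, 31),
// which negates each finite float lane.
static float flip_sign(float x)
{
    uint32_t u;
    std::memcpy(&u, &x, sizeof(u));
    u ^= 0x80000000u;
    std::memcpy(&x, &u, sizeof(x));
    return x;
}

// Clear bit 31: the scalar analogue of __lsx_vbitclri_w(x, 31), i.e. fabsf.
static float clear_sign(float x)
{
    uint32_t u;
    std::memcpy(&u, &x, sizeof(u));
    u &= 0x7fffffffu;
    std::memcpy(&x, &u, sizeof(x));
    return x;
}

int main()
{
    printf("%f %f\n", flip_sign(3.5f), clear_sign(-3.5f)); // -3.500000 3.500000
    return 0;
}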
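
Softmax_loongarch above only takes the LSX path for dims == 3 with positive_axis == 0 and works plane-wise over the channel dimension: a per-position max, exp(x - max) written back in place, a per-position sum, then a divide. A compact scalar sketch of the same algorithm on plain arrays (the exp and sum passes are merged here for brevity; illustrative only):

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <vector>

// Scalar reference of channel-axis softmax: value = exp(value - max) / sum.
static void softmax_axis0_ref(std::vector<float>& blob, int channels, int size)
{
    std::vector<float> maxv(size, -FLT_MAX);
    for (int q = 0; q < channels; q++)
        for (int i = 0; i < size; i++)
            maxv[i] = std::max(maxv[i], blob[q * size + i]);

    std::vector<float> sum(size, 0.f);
    for (int q = 0; q < channels; q++)
        for (int i = 0; i < size; i++)
        {
            float v = std::exp(blob[q * size + i] - maxv[i]);
            blob[q * size + i] = v;
            sum[i] += v;
        }

    for (int q = 0; q < channels; q++)
        for (int i = 0; i < size; i++)
            blob[q * size + i] /= sum[i];
}

int main()
{
    std::vector<float> blob = {1.f, 2.f, 3.f, 4.f}; // 2 channels x 2 positions
    softmax_axis0_ref(blob, 2, 2);                  // each position now sums to 1
    return 0;
}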